Example No. 1
def series_by_filename(tag, clean_store_dirpath):
    """Returns dictionary with path for files already in database, as defined
    as filename tag present.

    :param tag: 'filename'
    :param clean_store_dirpath: base path
    :return: {filename: filepath}
    """
    database = 'sec_master'
    cql = 'SHOW TAG VALUES ON "{}" ' \
          'WITH KEY="{}"'.format(database, tag)
    cql_response = db_man.influx_qry(cql).items()

    if cql_response:
        response = cql_response[0][1]
    else:
        response = cql_response

    # https://stackoverflow.com/a/39537308/3512107
    ans = OrderedDict()
    for resp in response:
        filename = resp['value']
        store_path = store_path_constructor(filename=filename,
                                            dir_path=clean_store_dirpath)
        ans[filename] = store_path
    return ans
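store_path_constructor is a project helper that is not shown here. A minimal sketch, assuming it simply joins the base directory with the filename (the real helper may add sub-directories or extensions):

import os

def store_path_constructor(filename, dir_path):
    # Hypothetical sketch: join the clean-store base path and the filename.
    return os.path.join(dir_path, filename)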
Example No. 2
def time_bounds(table, tags, position=('FIRST', 'LAST')):
    """ Get the first and/or last datetime for a series in a table
    a Series is defined by its tags
    """

    # We need one field key to build the query and get the time,
    # because SELECT * returns UNIX time.
    field_keys = get_field_keys(table)
    field_key_to_qry = field_keys[0]['fieldKey']

    # construct the WHERE clause of the CQL query using all tags provided
    where_cql = 'WHERE '
    for k, islast in iter_islast(tags):
        where_cql += '"{}"=\'{}\' '.format(k, tags[k])
        if not islast:
            where_cql += 'AND '

    ans = {}
    for each_position in position:
        cql = 'SELECT {}({}) ' \
              'FROM "{}" {}'.format(each_position,
                                    field_key_to_qry,
                                    table,
                                    where_cql)

        response = db_man.influx_qry(cql).get_points()

        time_on_db = pd.to_datetime(next(response)['time'])

        # floor to the minute and store the answer in a dict
        ans[each_position.lower()] = one_minute_adjustment(time_on_db)

    return ans
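iter_islast and one_minute_adjustment are project helpers not shown here. A minimal sketch of both, assuming iter_islast flags the last element of an iterable and one_minute_adjustment floors a timestamp to the start of its minute:

import pandas as pd

def iter_islast(iterable):
    # Yield (item, islast) pairs; islast is True only for the final item.
    it = iter(iterable)
    try:
        prev = next(it)
    except StopIteration:
        return
    for item in it:
        yield prev, False
        prev = item
    yield prev, True

def one_minute_adjustment(timestamp):
    # Hypothetical: truncate the timestamp to the start of the minute.
    return pd.Timestamp(timestamp).floor('min')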
Example No. 3
def get_series_info(table):
    """Returns tags of each series in a table

    :return: list of dictionaries
    """
    logger.info('Querying series info in table \'{}\''.format(table))

    # get series by symbols - provider tags - frequency (if bars)
    cql = 'SHOW TAG VALUES ON ' \
          '"securities_master" FROM "{}" ' \
          'WITH KEY IN ("provider", "symbol", "frequency")'.format(table)

    response = db_man.influx_qry(cql).get_points()

    provs = []
    symb = []
    freq = []
    for resp in response:
        if resp['key'] == 'provider':
            provs.append(resp['value'])
        elif resp['key'] == 'symbol':
            symb.append(resp['value'])
        elif resp['key'] == 'frequency':
            freq.append(resp['value'])

    # The fx_ticks table does not include a frequency tag; its usefulness
    # only became apparent after all series were inserted. At the moment
    # InfluxDB does not support adding tags to existing series. A feature
    # request is open:
    # https://github.com/influxdata/influxdb/issues/3904
    if not freq:
        freq.append('')

    series_tags = {'symbol': symb,
                   'provider': provs,
                   'frequency': freq}

    # construct all possibilities
    # Cartesian product of a dictionary of lists
    product_series_tags = product_dict(series_tags)

    ans = []
    for tag_product in product_series_tags:
        # add time bounds for each series
        bounds = time_bounds(table, tags=tag_product)

        ids = {'provider': tag_product['provider'],
               'symbol': tag_product['symbol']}
        data = {'first': bounds['first'],
                'last': bounds['last'],
                'frequency': tag_product['frequency']}

        # construct answer list of dictionaries
        ans.append({'id': ids,
                    'data': data})

    return ans
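product_dict expands a dict of lists into the Cartesian product of its values, one dict per combination. A minimal sketch based on the standard itertools recipe:

import itertools

def product_dict(d):
    # Yield one dict per combination of the values in d, e.g.
    # {'symbol': ['EURUSD'], 'provider': ['fxcm'], 'frequency': ['1min']}
    # yields {'symbol': 'EURUSD', 'provider': 'fxcm', 'frequency': '1min'}.
    keys = list(d)
    for combo in itertools.product(*d.values()):
        yield dict(zip(keys, combo))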
Example No. 4
def get_field_keys(table):
    """ Field keys for a selected table

    :param table:
    :return: list op dictionaries
    """
    cql = 'SHOW FIELD KEYS FROM \"{}\"'.format(table)
    response = db_man.influx_qry(cql).get_points()

    return list(response)
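With the influxdb-python client, SHOW FIELD KEYS points come back as dicts with 'fieldKey' and 'fieldType' entries, which is why time_bounds above indexes field_keys[0]['fieldKey']. A hypothetical call against the fx_ticks table:

# Hypothetical output:
# [{'fieldKey': 'ask', 'fieldType': 'float'},
#  {'fieldKey': 'bid', 'fieldType': 'float'}]
fields = get_field_keys('fx_ticks')
print([f['fieldKey'] for f in fields])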
Example No. 5
def tick_resampling(input_table, output_table, tags, start_datetime,
                    end_datetime):
    """Re sample tick data in securities master to desired frequency

    """
    # Define the time extension of each query.
    # The bigger the number, more RAM needed.
    delta = datetime.timedelta(hours=24)

    # Construct the intervals to obtain the data in chunks
    chunks = []
    init_dt = start_datetime
    while init_dt < end_datetime:
        end_dt = init_dt + delta
        chunks.append((init_dt, end_dt))
        init_dt = end_dt

    # Process each chunk: resample and insert
    for each_chunk in chunks:

        cql = 'SELECT time, bid, ask FROM {} ' \
              'WHERE symbol=\'{}\' ' \
              'AND provider=\'{}\' ' \
              'AND time>=\'{}\' ' \
              'AND time<\'{}\''.format(input_table,
                                       tags['symbol'],
                                       tags['provider'],
                                       each_chunk[0],
                                       each_chunk[1])

        # Get the ticks requested
        response = db_man.influx_qry(client_type='dataframe', cql=cql)

        # Weekends return no data
        if not response:
            logger.warning('No data for {} at {}'.format(tags.values(),
                                                         each_chunk[0]))
        else:
            logger.info('Resampling {} from {} delta {}'.format(tags.values(),
                                                                each_chunk[0],
                                                                delta))
            ticks = response[input_table]

            # Call the resampling function
            bars = ticks_to_bars(ticks=ticks, freq=tags['frequency'])

            # Insert into securities master database
            field_keys = ['open', 'high', 'low', 'close']

            db_man.influx_writer(data=bars,
                                 field_columns=field_keys,
                                 tags=tags,
                                 into_table=output_table)
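ticks_to_bars is defined elsewhere in the project. A minimal sketch, assuming it builds OHLC bars from the tick mid-price with pandas; the real function may construct bid/ask bars differently:

import pandas as pd

def ticks_to_bars(ticks, freq='1min'):
    # Hypothetical sketch: OHLC bars of the mid-price at the requested
    # frequency; ticks is assumed to be a DataFrame indexed by time
    # with 'bid' and 'ask' columns.
    mid = (ticks['bid'] + ticks['ask']) / 2
    bars = mid.resample(freq).ohlc()
    return bars.dropna()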
Example No. 6
def series_by_filename_row(table, clean_store_dirpath, abs_tolerance=10):
    """Returns dictionary with path for files already in database, as defined
    as filename tag present and checking row_count in database vs CSV

    :param table: table name
    :param clean_store_dirpath: base path
    :param abs_tolerance:
    :return: {filename: {row_count, filepath}
    """
    # Get the filename tags in the database
    tags_by_filename = \
        series_by_filename(tag='filename',
                           clean_store_dirpath=clean_store_dirpath)

    # For each series in database
    ans = dict()
    for each_filename, each_path in tags_by_filename.items():
        # query row count
        cql = 'SELECT COUNT(bid) ' \
              'FROM {} ' \
              'WHERE filename=\'{}\''.format(table,
                                             each_filename)
        cql_response = db_man.influx_qry(cql).items()

        row_count_db = next(cql_response[0][1])['count']

        # get the row count in the CSV, minus the header line
        row_count_csv = sum(1 for _r in opengz(each_path, 'r')) - 1

        # compare the two results
        difference = abs(row_count_db - row_count_csv)
        if difference <= abs_tolerance:
            logger.info('{} already in database with {} data points and {} '
                        'difference'.format(each_filename,
                                            row_count_db,
                                            difference))
            ans[each_filename] = each_path
        else:
            logger.warning('Incomplete series {} deleted, '
                           'difference {}'.format(each_filename,
                                                  difference))
            # If the difference exceeds the tolerance, the series is
            # incomplete; something went wrong, so delete it.
            db_man.delete_series(tags={'filename': each_filename})

    return ans
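opengz is the project's file opener for the compressed CSVs. A minimal sketch, assuming it is a thin wrapper over gzip that opens the file in text mode so iteration yields one line at a time:

import gzip

def opengz(path, mode='r'):
    # Hypothetical: open the gzipped CSV in text mode; each iteration
    # then yields one line (the -1 above skips the header row).
    return gzip.open(path, mode + 't')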