Example #1
0
def to_bcolz(df, rootdir, expectedlen=None, reset_index=True, compute=True,
             overwrite=True, get=get_sync):
    """ Save dask DataFrame to BColz table

    The first partition creates the on-disk ctable; every following
    partition is appended to it.  The append tasks are chained (task ``i``
    depends on task ``i - 1``) so that requesting the last key forces every
    partition to be written, in partition order.

    Parameters
    ----------
    df: da.DataFrame
    rootdir: directory to save BColz table
    expectedlen: expected length of table.  When None it is estimated inside
        the graph as ``len(first partition) * df.npartitions``.
    reset_index: if true, ``df.reset_index()`` is applied before writing
    compute: if true, execute the graph with ``get`` and return the result
        of ``DataFrame._get``; otherwise return a delayed object wrapping
        the final task
    overwrite: if true, an existing ``rootdir`` is removed first
    get: scheduler ``get`` function handed to ``DataFrame._get``

    Raises
    ------
    ValueError
        If ``rootdir`` already exists and ``overwrite`` is false.
    """
    from bcolz import ctable

    if os.path.exists(rootdir):
        if not overwrite:
            raise ValueError('Directory already exists')
        import shutil
        shutil.rmtree(rootdir)

    name = 'to-bcolz-' + uuid.uuid1().hex

    if expectedlen is None:
        # Deferred estimate: evaluated by the scheduler as a task, using the
        # length of the first (pre-reset_index) partition.
        expectedlen = (lambda part, n: len(part) * n,
                       (df._name, 0), df.npartitions)

    if reset_index:
        df = df.reset_index()

    def create(part, exp_len, rt):
        # Create the on-disk table from the first partition.
        return ctable.fromdataframe(part, expectedlen=int(exp_len),
                                    rootdir=rt)

    def append(ct, part):
        # Append one partition and hand the table on to the next task.
        ct.append(ctable.fromdataframe(part))
        return ct

    dsk = {
        (name, -1): expectedlen,
        (name, 0): (create, (df._name, 0), (name, -1), rootdir),
    }
    for i in range(1, df.npartitions):
        # Depend on the previous write, not on (name, 0): otherwise only the
        # last partition's append is reachable from the requested key and
        # the intermediate partitions are never written.
        dsk[(name, i)] = (append, (name, i - 1), (df._name, i))

    dsk = merge(df.dask, dsk)
    keys = [(name, df.npartitions - 1)]
    if compute:
        return DataFrame._get(dsk, keys, get=get)
    else:
        return delayed([Delayed(key, [dsk]) for key in keys])
Example #2
0
    def to_ctable(self, raw_data, invalid_data_behavior):
        """Build a ctable of integer columns from a raw options frame.

        A ``ctable`` input is returned untouched.  Otherwise the PRICING
        columns are multiplied by 1000, rounded and stored as uint32; the
        GREEKS columns get the same scaling but are stored as int64 so they
        stay signed.  ``day`` (index as seconds), ``days_to_expiration``
        (days), ``volume`` and ``open_interest`` are stored as uint32.

        ``winsorise_uint32`` and ``check_uint32_safe`` are defined elsewhere;
        presumably they guard against values that do not fit in uint32 --
        confirm against their definitions.
        """
        if isinstance(raw_data, ctable):
            # Nothing to convert.
            return raw_data

        # Same call for both count columns, each together with the pricing
        # fields.
        for count_column in ("volume", "open_interest"):
            winsorise_uint32(raw_data, invalid_data_behavior, count_column,
                             *PRICING)

        # Pricing is unsigned, greeks are signed; scale each group on its own.
        scaled_pricing = raw_data[list(PRICING)] * 1000
        pricing_part = scaled_pricing.round().astype("uint32")
        scaled_greeks = raw_data[list(GREEKS)] * 1000
        greeks_part = scaled_greeks.round().astype("int64")
        processed = pd.concat([pricing_part, greeks_part], axis=1)

        # Date-like columns: validate the maxima before narrowing to uint32.
        dates = raw_data.index.values.astype("datetime64[s]")
        days_to_expiration = raw_data.days_to_expiration.values.astype(
            "timedelta64[D]")
        latest_day = dates.max().view(np.int64)
        check_uint32_safe(latest_day, "day")
        longest_expiry = days_to_expiration.max().view(np.int64)
        check_uint32_safe(longest_expiry, "days_to_expiration")

        processed["day"] = dates.astype("uint32")
        processed["days_to_expiration"] = days_to_expiration.astype("uint32")
        processed["volume"] = raw_data.volume.astype("uint32")
        processed["open_interest"] = raw_data.open_interest.astype("uint32")

        return ctable.fromdataframe(processed)
Example #3
0
def to_ctable(raw_data, invalid_data_behavior):
    """Convert a raw pricing frame into a ctable of integer columns.

    A ``ctable`` input is returned unchanged.  Otherwise the OHLC columns
    are multiplied by ``OHLC_RATIO`` and cast to uint32, the index is stored
    as ``day`` (seconds, uint32) and ``volume`` is cast to uint32.

    Options data may carry extra columns.  Each of them is converted only
    when it is present in ``raw_data``: ``open_interest`` (uint32), ``bid``
    and ``ask`` (scaled, uint32), and the greeks plus ``iv`` (scaled,
    int32).  A missing optional column is skipped; a conversion error on a
    column that is present is no longer silently swallowed.
    """
    if isinstance(raw_data, ctable):
        # we already have a ctable so do nothing
        return raw_data
    winsorise_uint32(raw_data, invalid_data_behavior, 'volume', *OHLC)
    processed = (raw_data[list(OHLC)] * OHLC_RATIO).astype('uint32')
    dates = raw_data.index.values.astype('datetime64[s]')
    check_uint32_safe(dates.max().view(np.int64), 'day')
    processed['day'] = dates.astype('uint32')
    processed['volume'] = raw_data.volume.astype('uint32')

    # Options-only columns.  Test for presence explicitly instead of
    # wrapping everything in a bare ``except`` that would also hide typos,
    # NameErrors and casting failures.
    available = raw_data.columns
    if 'open_interest' in available:
        processed['open_interest'] = raw_data['open_interest'].astype('uint32')

    # (column, target dtype): quotes are unsigned, greeks/iv are signed.
    scaled_option_columns = (
        ('bid', np.uint32),
        ('ask', np.uint32),
        ('delta', np.int32),
        ('gamma', np.int32),
        ('theta', np.int32),
        ('vega', np.int32),
        ('rho', np.int32),
        ('iv', np.int32),
    )
    for column, dtype in scaled_option_columns:
        if column in available:
            processed[column] = (raw_data[column] * OHLC_RATIO).astype(dtype)
    return ctable.fromdataframe(processed)
Example #4
0
def shards(bcolz_dir, taxi_df):
    """Write ``taxi_df`` as one bcolz table plus ``NR_SHARDS`` shard tables.

    The full table is written to ``yellow_tripdata_2016-01.bcolz`` under
    ``bcolz_dir``; its rows are then split into ``NR_SHARDS`` consecutive
    slices written to ``tripdata_2016-01-<n>.bcolzs``.  All shards hold
    ``len // NR_SHARDS`` rows except the last, which also takes the
    remainder.

    Yields a single list: the full table's path followed by the shard paths
    (presumably consumed as a pytest fixture -- the decorator is not visible
    here).

    Raises
    ------
    ValueError
        If the table has fewer rows than ``NR_SHARDS``.
    """
    single_bcolz = str(bcolz_dir.join('yellow_tripdata_2016-01.bcolz'))
    ct = ctable.fromdataframe(taxi_df, rootdir=single_bcolz)

    total_rows = len(ct)
    step, remainder = divmod(total_rows, NR_SHARDS)
    if step == 0:
        # The previous range(0, len, 0) raised ValueError here as well.
        raise ValueError('fewer rows than shards')

    shard_paths = [single_bcolz]

    for count in range(NR_SHARDS):
        print("Creating shard {}".format(count + 1))

        start = count * step
        # The last shard absorbs the rows left over by the integer division.
        size = step + remainder if count == NR_SHARDS - 1 else step

        shard_file = str(bcolz_dir.join('tripdata_2016-01-%s.bcolzs' % count))
        ct_shard = bcolz.fromiter(ct.iter(start, start + size),
                                  ct.dtype,
                                  size,
                                  rootdir=shard_file,
                                  mode='w')
        shard_paths.append(shard_file)

        ct_shard.flush()

    yield shard_paths
Example #5
0
def to_ctable(raw_data, invalid_data_behavior):
    """Convert a raw pricing frame into a ctable of integer columns.

    A ``ctable`` input is returned unchanged.  Otherwise the OHLC columns
    are multiplied by ``OHLC_RATIO`` and cast to uint32, the index is stored
    as ``day`` (seconds, uint32) and ``volume`` is cast to uint32.

    Options data may carry extra columns.  Each of them is converted only
    when it is present in ``raw_data``: ``open_interest`` (uint32), ``bid``
    and ``ask`` (scaled, uint32), and the greeks plus ``iv`` (scaled,
    int32).  A missing optional column is skipped; a conversion error on a
    column that is present is no longer silently swallowed.
    """
    if isinstance(raw_data, ctable):
        # we already have a ctable so do nothing
        return raw_data
    winsorise_uint32(raw_data, invalid_data_behavior, 'volume', *OHLC)
    processed = (raw_data[list(OHLC)] * OHLC_RATIO).astype('uint32')
    dates = raw_data.index.values.astype('datetime64[s]')
    check_uint32_safe(dates.max().view(np.int64), 'day')
    processed['day'] = dates.astype('uint32')
    processed['volume'] = raw_data.volume.astype('uint32')

    # Options-only columns.  Test for presence explicitly instead of
    # wrapping everything in a bare ``except`` that would also hide typos,
    # NameErrors and casting failures.
    available = raw_data.columns
    if 'open_interest' in available:
        processed['open_interest'] = raw_data['open_interest'].astype('uint32')

    # (column, target dtype): quotes are unsigned, greeks/iv are signed.
    scaled_option_columns = (
        ('bid', np.uint32),
        ('ask', np.uint32),
        ('delta', np.int32),
        ('gamma', np.int32),
        ('theta', np.int32),
        ('vega', np.int32),
        ('rho', np.int32),
        ('iv', np.int32),
    )
    for column, dtype in scaled_option_columns:
        if column in available:
            processed[column] = (raw_data[column] * OHLC_RATIO).astype(dtype)
    return ctable.fromdataframe(processed)
Example #6
0
    def _raw_data_for_asset(self, asset_id):
        """
        Build the synthetic 'raw' ctable for a single asset.

        Every cell starts at ``asset_id * 100000``.  The first five columns
        additionally receive ``10000 * column index`` and the number of days
        between each date and ``self.PSEUDO_EPOCH``.  ``day`` is then set
        from the dates (converted by ``nanos_to_seconds``) and ``id`` to
        the asset id.

        See class docstring for a description of the data format.
        """
        # Restrict the calendar to the sessions between the asset's start
        # and end according to our asset info.
        first_session = self.asset_start(asset_id)
        last_session = self.asset_end(asset_id)
        lifetime = self._calendar.slice_indexer(first_session, last_session)
        dates = self._calendar[lifetime]

        shape = (len(dates), len(US_EQUITY_PRICING_BCOLZ_COLUMNS))
        base_value = asset_id * (100 * 1000)
        data = full(shape, base_value, dtype=uint32)

        # Basic slicing returns a view, so in-place adds write into ``data``.
        ohlcv = data[:, :5]

        # Per-column offset: 10,000 * column-index.
        column_offsets = arange(5, dtype=uint32) * (10 * 1000)
        ohlcv += column_offsets

        # Per-row offset: days elapsed since the pseudo epoch.
        elapsed_days = (dates - self.PSEUDO_EPOCH).days[:, None].astype(uint32)
        ohlcv += elapsed_days

        frame = DataFrame(data, index=dates,
                          columns=US_EQUITY_PRICING_BCOLZ_COLUMNS)
        frame['day'] = nanos_to_seconds(dates.asi8)
        frame['id'] = asset_id

        return ctable.fromdataframe(frame)
Example #7
0
def test_downloader(redis_server, downloader, tmpdir):
    """Check that a redis ticket makes the downloader fetch a file from S3.

    The test builds a small bcolz table, zips it, uploads it to the
    ``bcolz`` bucket as ``test.bcolz``, registers a ticket entry in redis
    for this host, waits, and then asserts that the file showed up in the
    ticket's incoming directory and that the redis slot ends in ``DONE``.
    """
    # Make a bcolz from a pandas DataFrame
    data_df = pd.DataFrame(
        data=np.random.rand(100, 10),
        columns=['col_{}'.format(i+1) for i in range(10)])
    local_bcolz = str(tmpdir.join('test_bcolz'))
    ctable.fromdataframe(data_df, rootdir=local_bcolz)

    assert os.path.isdir(local_bcolz)

    # Zip up the bcolz directory and upload to S3
    upload_dir = tmpdir.mkdir('upload')
    zipfile_path = bqueryd.util.zip_to_file(local_bcolz, str(upload_dir))[0]
    assert os.path.isfile(zipfile_path)

    upload_file = str(upload_dir.join('test.bcolz'))
    shutil.move(zipfile_path, upload_file)
    assert os.path.isfile(upload_file)

    s3_conn = downloader._get_s3_conn()[-1]

    with clean_bucket(s3_conn, 'bcolz') as bucket:
        # Close the handle deterministically once the upload is done.
        with open(upload_file, 'rb') as upload_handle:
            bucket.put_object(Key='test.bcolz', Body=upload_handle)

        uploads = [key.key for key in bucket.objects.all()]
        assert uploads == ['test.bcolz']

        # Construct the redis entry that the downloader is looking for
        progress_slot = '%s_%s' % (time.time() - 60, -1)
        node_filename_slot = '%s_%s' % (socket.gethostname(), 's3://bcolz/test.bcolz')
        ticket = str(uuid4())

        incoming_dir = os.path.join(bqueryd.INCOMING, ticket)
        assert not os.path.isdir(incoming_dir)

        redis_server.hset(bqueryd.REDIS_TICKET_KEY_PREFIX + ticket, node_filename_slot, progress_slot)

        # wait for the downloader to catch up
        sleep(10)

        # Check that incoming dir now has the test.bcolz file.
        assert os.listdir(incoming_dir) == ['test.bcolz']

        # Check that the progress slot has been updated
        updated_slot = redis_server.hget(bqueryd.REDIS_TICKET_KEY_PREFIX + ticket, node_filename_slot)
        assert updated_slot.split('_')[-1] == 'DONE'
Example #8
0
 def gen_tables(self, assets):
     """
     Yield ``(asset, ctable)`` pairs, loading each asset's CSV on demand.

     The CSV path comes from ``self._asset_map``; the file is read with the
     ``day`` column parsed as dates and ``self._csv_dtypes`` as dtypes.
     Raises ``KeyError`` as soon as an asset without a path is reached.
     """
     csv_dtypes = self._csv_dtypes
     for asset in assets:
         csv_path = self._asset_map.get(asset)
         if csv_path is None:
             raise KeyError("No path supplied for asset %s" % asset)
         frame = read_csv(csv_path, parse_dates=['day'], dtype=csv_dtypes)
         table = ctable.fromdataframe(frame)
         yield asset, table
Example #9
0
 def gen_tables(self, assets):
     """
     Yield ``(asset, ctable)`` pairs, loading each asset's CSV on demand.

     The CSV path comes from ``self._asset_map``; the file is read with the
     ``day`` column parsed as dates and ``self._csv_dtypes`` as dtypes.
     Raises ``KeyError`` as soon as an asset without a path is reached.
     """
     csv_dtypes = self._csv_dtypes
     for asset in assets:
         csv_path = self._asset_map.get(asset)
         if csv_path is None:
             raise KeyError("No path supplied for asset %s" % asset)
         frame = read_csv(csv_path, parse_dates=['day'], dtype=csv_dtypes)
         table = ctable.fromdataframe(frame)
         yield asset, table
Example #10
0
def to_ctable(raw_data, invalid_data_behavior):
    """Convert a raw OHLCV frame into a ctable of uint32 columns.

    A ``ctable`` argument is passed straight through.  Otherwise the OHLC
    columns are multiplied by 1000 and cast to uint32, the index is stored
    as ``day`` (seconds, uint32) and ``volume`` is cast to uint32.
    ``winsorise_uint32`` and ``check_uint32_safe`` are defined elsewhere.
    """
    if isinstance(raw_data, ctable):
        # Already in the target format.
        return raw_data

    winsorise_uint32(raw_data, invalid_data_behavior, 'volume', *OHLC)

    price_columns = list(OHLC)
    scaled_prices = raw_data[price_columns] * 1000
    processed = scaled_prices.astype('uint32')

    # Index at second resolution; its maximum must fit in uint32.
    dates = raw_data.index.values.astype('datetime64[s]')
    latest = dates.max().view(np.int64)
    check_uint32_safe(latest, 'day')

    processed['day'] = dates.astype('uint32')
    processed['volume'] = raw_data.volume.astype('uint32')
    return ctable.fromdataframe(processed)
Example #11
0
def to_ctable(raw_data, invalid_data_behavior):
    """Return ``raw_data`` as a ctable with uint32 pricing columns.

    Input that is already a ``ctable`` is returned as is.  For a frame, the
    OHLC columns are scaled by 1000 and cast to uint32; ``day`` holds the
    index converted to seconds and ``volume`` the volume column, both as
    uint32.  ``winsorise_uint32`` and ``check_uint32_safe`` are defined
    elsewhere.
    """
    if not isinstance(raw_data, ctable):
        winsorise_uint32(raw_data, invalid_data_behavior, 'volume', *OHLC)

        ohlc_frame = raw_data[list(OHLC)]
        processed = (ohlc_frame * 1000).astype('uint32')

        # Validate the latest timestamp before narrowing to uint32.
        session_seconds = raw_data.index.values.astype('datetime64[s]')
        check_uint32_safe(session_seconds.max().view(np.int64), 'day')

        processed['day'] = session_seconds.astype('uint32')
        processed['volume'] = raw_data.volume.astype('uint32')
        return ctable.fromdataframe(processed)

    # Already a ctable: nothing to do.
    return raw_data
    def to_ctable(self, raw_data, invalid_data_behavior):
        """Convert a raw OHLCV frame into a ctable of uint32 columns.

        A ``ctable`` argument is returned unchanged.  Otherwise the OHLC
        columns are multiplied by 1000, rounded and cast to uint32; the
        index becomes ``day`` (seconds, uint32) and ``volume`` is cast to
        uint32.  ``winsorise_uint32`` and ``check_uint32_safe`` are defined
        elsewhere.
        """
        if isinstance(raw_data, ctable):
            # Already converted.
            return raw_data

        winsorise_uint32(raw_data, invalid_data_behavior, "volume", *OHLC)

        # Round after scaling so the cast does not simply truncate.
        scaled_prices = raw_data[list(OHLC)] * 1000
        processed = scaled_prices.round().astype("uint32")

        dates = raw_data.index.values.astype("datetime64[s]")
        latest = dates.max().view(np.int64)
        check_uint32_safe(latest, "day")

        processed["day"] = dates.astype("uint32")
        processed["volume"] = raw_data.volume.astype("uint32")
        return ctable.fromdataframe(processed)
Example #13
0
 def to_ctable(self, raw_data, invalid_data_behavior):
     """Convert a raw frame with OHLCV and extra columns into a ctable.

     A ``ctable`` argument is returned unchanged.  Otherwise the OHLC,
     ``EXTRA_COLUMNS`` and ``volume`` columns are cast to uint32 without
     any scaling, and the index is stored as ``day`` (seconds, uint32).
     ``winsorise_uint32`` and ``check_uint32_safe`` are defined elsewhere.
     """
     if isinstance(raw_data, ctable):
         # Already converted.
         return raw_data

     # Overflow handling covers OHLCV plus the extra columns.
     value_columns = OHLC.union(EXTRA_COLUMNS)
     winsorise_uint32(raw_data, invalid_data_behavior, 'volume', *value_columns)

     # All value columns are cast to uint32 as they are (no x1000 scaling).
     selected = list(value_columns.union(['volume']))
     processed = raw_data[selected].astype('uint32')

     dates = raw_data.index.values.astype('datetime64[s]')
     latest = dates.max().view(np.int64)
     check_uint32_safe(latest, 'day')

     processed['day'] = dates.astype('uint32')
     processed['volume'] = raw_data.volume.astype('uint32')
     return ctable.fromdataframe(processed)
Example #14
0
    def to_ctable(self, raw_data, invalid_data_behavior):
        """Convert a raw OHLCV frame (plus optional extras) into a ctable.

        A ``ctable`` argument is returned unchanged.  Otherwise the OHLC
        columns are multiplied by 1000, rounded and cast to uint32; the
        index becomes ``day`` (seconds, uint32) and ``volume`` is cast to
        uint32.  Every key of ``NON_ADJUSTED_COLUMN_FACTOR`` that is also a
        column of ``raw_data`` is multiplied by its factor and cast to
        uint32 as well.
        """
        if isinstance(raw_data, ctable):
            # Already converted.
            return raw_data

        winsorise_uint32(raw_data, invalid_data_behavior, 'volume', *OHLC)

        scaled_prices = raw_data[list(OHLC)] * 1000
        processed = scaled_prices.round().astype('uint32')

        dates = raw_data.index.values.astype('datetime64[s]')
        latest = dates.max().view(np.int64)
        check_uint32_safe(latest, 'day')

        processed['day'] = dates.astype('uint32')
        processed['volume'] = raw_data.volume.astype('uint32')

        # Extra columns are likewise converted to uint32, each with its own
        # factor; columns absent from the input are skipped.
        for column, factor in NON_ADJUSTED_COLUMN_FACTOR.items():
            if column not in raw_data.columns:
                continue
            scaled = raw_data.loc[:, column] * factor
            processed[column] = scaled.astype('uint32')
        return ctable.fromdataframe(processed)
Example #15
0
    def _raw_data_for_asset(self, asset_id):
        """
        Generate 'raw' data that encodes information about the asset.

        Every cell starts at ``asset_id * 100000``.  The first five columns
        additionally receive ``10000 * column index`` and the number of days
        between each date and ``self.PSEUDO_EPOCH``.  ``day`` is then set
        from the dates (converted by ``nanos_to_seconds``) and ``id`` to
        the asset id.

        See class docstring for a description of the data format.
        """
        # Get the dates for which this asset existed according to our asset
        # info.
        dates = self._calendar[self._calendar.slice_indexer(self.asset_start(asset_id), self.asset_end(asset_id))]

        data = full((len(dates), len(US_EQUITY_PRICING_BCOLZ_COLUMNS)), asset_id * (100 * 1000), dtype=uint32)

        # Add 10,000 * column-index to OHLCV columns.  The addend is built
        # as uint32: an in-place add of an int64 array into a uint32 array
        # is rejected by NumPy's default 'same_kind' casting rule.
        data[:, :5] += arange(5, dtype=uint32) * (10 * 1000)

        # Add days since Jan 1 2001 for OHLCV columns (cast for the same
        # reason as above).
        data[:, :5] += (dates - self.PSEUDO_EPOCH).days[:, None].astype(uint32)

        frame = DataFrame(data, index=dates, columns=US_EQUITY_PRICING_BCOLZ_COLUMNS)

        frame["day"] = nanos_to_seconds(dates.asi8)
        frame["id"] = asset_id

        return ctable.fromdataframe(frame)
Example #16
0
 def gen_tables(self, assets):
     """
     Yield ``(key, ctable)`` for every key obtained by iterating ``assets``.

     Each value ``assets[key]`` is converted with ``ctable.fromdataframe``.
     """
     for key in assets:
         frame = assets[key]
         table = ctable.fromdataframe(frame)
         yield key, table
Example #17
0
 def gen_tables(self, assets):
     """
     Yield ``(key, ctable)`` for every key obtained by iterating ``assets``.

     Each value ``assets[key]`` is converted with ``ctable.fromdataframe``.
     """
     for key in assets:
         frame = assets[key]
         table = ctable.fromdataframe(frame)
         yield key, table