Ejemplo n.º 1
0
def test_hour(tencent):
    """Cumulate the first 50 rows into a 1-hour time frame and check the result.

    The last row of the cumulated frame is passed to
    ``expect_cumulated_result`` together with the 50-row slice it was
    built from.
    """
    first_rows = tencent.iloc[:50]
    hourly = StockDataFrame(first_rows, date_col=TIME_KEY, time_frame='1h')
    last_cumulated = hourly.cumulate().iloc[-1]

    expect_cumulated_result(last_cumulated, first_rows)
Ejemplo n.º 2
0
def test_cum_append_many_from_empty(tencent):
    """Append the first ``LENGTH`` rows to an empty 5-minute frame in one call."""
    empty_frame = StockDataFrame(date_col=TIME_KEY, time_frame='5m')
    leading_rows = tencent.iloc[:LENGTH]
    stock = empty_frame.cum_append(leading_rows)

    expect_cumulated(tencent, stock, LENGTH)
Ejemplo n.º 3
0
def get_tencent():
    """Read the file referenced by ``csv`` and wrap it in a StockDataFrame.

    The ``'time_key'`` column is passed as the frame's date column.
    """
    raw_frame = read_csv(csv)
    return StockDataFrame(raw_frame, date_column='time_key')
def test_main():
    """Smoke test: constructing a StockDataFrame without arguments must not raise."""
    StockDataFrame()
def _nansToZero(values):
    """Return a copy of *values* with every NaN replaced by 0."""
    import math
    return [0 if math.isnan(x) else x for x in values]


def _scaleByPeak(values):
    """Return *values* divided element-wise by their largest element.

    A window whose peak is 0 (for example indicator warm-up rows whose NaNs
    were replaced by 0) is returned unscaled instead of raising
    ZeroDivisionError.
    """
    peak = max(values)
    if peak == 0:
        return list(values)
    return [v / peak for v in values]


def prepareData(symbol, dataObtainer, startDate, endDate):
    """Build normalized 15-day feature windows for *symbol*.

    The historical data is averaged into 3-hour data points (8 per day).
    Technical indicators are computed over the averaged closing prices, and
    a window of 15 * 8 = 120 data points is emitted for every day.

    :param symbol: passed to ``dataObtainer.getHistoricalDataAsDataframe``.
    :param dataObtainer: object whose ``getHistoricalDataAsDataframe(symbol)``
                         returns a DataFrame with "Timestamp", "Close" and
                         "Volume" columns.
    :param startDate: first 3-hour interval to process (inclusive).
    :param endDate: processing stops once the interval start reaches this.
    :return: a list of entries, each of the form
             ``[close, volume, rsi, rsi2, rsi3, ma, ma2, bollUpper, bollLower]``
             where every element is a list of 120 values.
    """
    df = dataObtainer.getHistoricalDataAsDataframe(symbol)

    dataTimeInterval = timedelta(hours=3)
    datapointsPerDay = 8
    numberOfSamples = 15 * datapointsPerDay

    def rowAt(moment):
        # NOTE(review): df.index yields labels while df.iloc below expects
        # positions; this assumes a default RangeIndex -- confirm.
        matches = df.index[df["Timestamp"] == moment].tolist()
        return matches[0] if matches else None

    # We gather all of the means for our inputs.
    closeMeans = []
    volumeMeans = []
    date = startDate

    while date < endDate:
        print("Processing", date, "/", endDate)
        nextDate = date + dataTimeInterval
        startIndex = rowAt(date)
        endIndex = rowAt(nextDate) if startIndex is not None else None

        if startIndex is None or endIndex is None:
            # Missing data: repeat the previous data point. When the gap is
            # at the very beginning there is nothing to repeat yet, so the
            # interval is skipped and the series starts at the first
            # interval that has data (this used to raise IndexError).
            if closeMeans:
                closeMeans.append(closeMeans[-1])
                volumeMeans.append(volumeMeans[-1])
            date = nextDate
            continue

        data = df.iloc[startIndex:endIndex]
        closeMeans.append(data["Close"].mean())
        volumeMeans.append(data["Volume"].mean())
        date = nextDate

    stock = StockDataFrame({'close': closeMeans})

    # The standard RSI is 14 day; with 8 data points per day that is a
    # 112-period RSI. NaNs (indicator warm-up rows) are replaced by 0.
    rsis = _nansToZero((stock["rsi:112"] / 100).tolist())
    rsis2 = _nansToZero((stock["rsi:56"] / 100).tolist())
    rsis3 = _nansToZero((stock["rsi:28"] / 100).tolist())
    mas = _nansToZero(stock["macd:96,208,72"].tolist())
    mas2 = _nansToZero(stock["macd:48,104,36"].tolist())
    bollUppers = _nansToZero(stock["boll.upper:160"].tolist())
    bollLowers = _nansToZero(stock["boll.lower:160"].tolist())

    entryAmount = len(closeMeans) - numberOfSamples - 1
    # Loop-invariant, so computed once.
    # NOTE(review): this is the peak of the WHOLE series, unlike every other
    # feature, which is scaled by its own window -- confirm that is intended.
    maxMA = max(mas) if mas else 0
    formattedData = []

    for i in range(0, len(closeMeans) - numberOfSamples, datapointsPerDay):
        # entryAmount is 0 when exactly one entry is produced.
        percent = i / entryAmount * 100 if entryAmount else 0.0
        print("Percent of entries created: " + str(percent) + "%")
        window = slice(i, i + numberOfSamples)

        if maxMA:
            ma = [((m / maxMA) + 1) / 2 for m in mas[window]]
        else:
            # A zero peak makes the ratio undefined; treat it as 0, which
            # maps to the midpoint 0.5.
            ma = [0.5 for _ in mas[window]]

        formattedData.append([
            _scaleByPeak(closeMeans[window]),
            _scaleByPeak(volumeMeans[window]),
            rsis[window],
            rsis2[window],
            rsis3[window],
            ma,
            mas2[window],
            _scaleByPeak(bollUppers[window]),
            _scaleByPeak(bollLowers[window]),
        ])

    return formattedData
Ejemplo n.º 6
0
def test_directive_stringify(stock: StockDataFrame):
    """Both the method and the module-level function expand 'boll' to its full form."""
    expected = 'boll:20,close'

    from_method = stock.directive_stringify('boll')
    assert from_method == expected

    from_function = directive_stringify('boll')
    assert from_function == expected
Ejemplo n.º 7
0
        auto_adjust = True,

        # download pre/post regular market hours data
        # (optional, default is False)
        prepost = True,

        # use threads for mass downloading? (True/False/Integer)
        # (optional, default is True)
        threads = True,

        # proxy URL scheme to use when downloading?
        # (optional, default is None)
        proxy = None
    )

# Wrap `data` in a StockDataFrame.
stock = StockDataFrame(data)

# Register lower-case aliases for the capitalised price columns.
for alias_name, column_name in (
    ('open', 'Open'),
    ('high', 'High'),
    ('low', 'Low'),
    ('close', 'Close'),
):
    stock.alias(alias_name, column_name)

print(stock)

# `cross_up_upper` is the series of high prices each of which crosses up the
# upper bollinger band. Every other item is set to `np.nan` so that
# mplfinance will not draw markers for those items.
cross_up_upper = stock['high'].copy()
not_above_upper_band = ~stock['column:high > boll.upper']
cross_up_upper[not_above_upper_band] = np.nan

cross_down_lower = stock['low'].copy()
Ejemplo n.º 8
0
    def createDataset(self, symbol: str, startDate, endDate, useAllIndicators=True,
                      isAugmenting=False, timePeriodForOutputs=24):
        """
        Creates a dataset and stores it in ``self.inputData`` (features) and
        ``self.outputData`` (labels). Please make sure that the start and end
        dates are the beginnings of days.

        Each label row holds the 15th, 25th, 35th, 50th (median), 65th, 75th
        and 85th percentiles of the closing prices in the output period,
        divided by twice the mean closing price of the last input day.

        :param symbol: e.g. "BTCUSDT"
        :param startDate: e.g. datetime(year=2020, month=1, day=1)
        :param endDate: e.g. datetime(year=2020, month=2, day=1)
        :param useAllIndicators: if False, only uses the minimum indicators
        :param isAugmenting: used by createAugmentedDataset when augmenting.
        :param timePeriodForOutputs: if set to 24, this will generate the labels
                                     (percentiles) for the next 24 hours after
                                     the 15-day period that appears in the input.
        :return: None; results are left on ``self.inputData`` and
                 ``self.outputData``.
        :raises IndexError: if data is missing before any value has been
                            collected, because there is nothing to carry
                            forward yet.
        """
        # Both dates are localized to GMT. NOTE(review): presumably they must
        # be naive datetimes for pytz's localize -- confirm.
        gmt = pytz.timezone("Etc/GMT-0")
        # We need to go back a little earlier to generate indicators such as RSI.
        startDate -= timedelta(days=DAYS_IN_AN_INPUT + 60)
        endDate = gmt.localize(endDate)
        startDate = gmt.localize(startDate)

        # The final features and labels are collected here.
        self.inputData = []
        self.outputData = []

        # This dataframe has all the raw data we need to generate the dataset.
        df = self.dataObtainer.getHistoricalDataAsDataframe(symbol)

        # Per-interval means of the inputs.
        closeMeans = []
        volumeMeans = []

        # Per-interval distribution of the prices in the output period.
        outputMedians = []
        output15thPercentiles = []
        output25thPercentiles = []
        output35thPercentiles = []
        output65thPercentiles = []
        output75thPercentiles = []
        output85thPercentiles = []

        # Means of the last (most recent) input day, used to normalize.
        priceMeansToDivideLabelsBy = []
        volumeMeansToDivideLabelsBy = []

        # The three groups below are always extended together, so every list
        # gains exactly one element per loop iteration and all stay aligned.
        inputSeries = (closeMeans, volumeMeans)
        labelSeries = (outputMedians,
                       output15thPercentiles, output25thPercentiles,
                       output35thPercentiles, output65thPercentiles,
                       output75thPercentiles, output85thPercentiles)
        scaleSeries = (priceMeansToDivideLabelsBy, volumeMeansToDivideLabelsBy)

        def repeatLast(*series):
            # Carries the previous value forward; raises IndexError when a
            # list is still empty.
            for column in series:
                column.append(column[-1])

        def rowsBetween(begin, end):
            # Returns the rows from the one stamped `begin` up to (excluding)
            # the one stamped `end`, or None when either timestamp is absent.
            first = df.index[df["Timestamp"] == begin].tolist()
            if len(first) == 0:
                return None
            last = df.index[df["Timestamp"] == end].tolist()
            if len(last) == 0:
                return None
            return df.iloc[first[0]: last[0]]

        date = startDate

        # For augmentation:
        phaseShift = uniform(0, np.pi * 2)
        count = 0

        # Collect the input prices, input volumes, and output percentiles.
        while date < endDate:
            print("Processing", date, "/", endDate)

            # 1) The input data point: mean price and volume of one interval
            #    (3 hours of data if that is our input time interval).
            inputRows = rowsBetween(date, date + self._dataTimeInterval)

            if inputRows is None:
                # We may be missing some data in our dataset (possibly times
                # when Binance was down). In this case, reuse the previous data.
                date += self._dataTimeInterval
                repeatLast(*inputSeries, *labelSeries, *scaleSeries)
                continue

            if isAugmenting:
                augmentation = 1 + np.sin(phaseShift + count) * uniform(0.02, 0.04)
                closeMeans.append(inputRows["Close"].mean() * augmentation)
                volumeMeans.append(inputRows["Volume"].mean() * augmentation)
                count += uniform(0.3, 0.6)

                if count > 2 * np.pi:
                    count = 0
            else:
                closeMeans.append(inputRows["Close"].mean())
                volumeMeans.append(inputRows["Volume"].mean())

            # 2) The output period that belongs to an entry beginning at this
            #    data point, and its percentiles.
            outputStart = date + timedelta(days=DAYS_IN_AN_INPUT)
            outputRows = rowsBetween(
                outputStart, outputStart + timedelta(hours=timePeriodForOutputs))

            if outputRows is None:
                date += self._dataTimeInterval
                repeatLast(*labelSeries, *scaleSeries)
                continue

            futureCloses = outputRows["Close"]
            outputMedians.append(futureCloses.median())
            output15thPercentiles.append(futureCloses.quantile(0.15))
            output25thPercentiles.append(futureCloses.quantile(0.25))
            output35thPercentiles.append(futureCloses.quantile(0.35))
            output65thPercentiles.append(futureCloses.quantile(0.65))
            output75thPercentiles.append(futureCloses.quantile(0.75))
            output85thPercentiles.append(futureCloses.quantile(0.85))

            # 3) The last input day's means, used to normalize the outputs.
            lastDayRows = rowsBetween(
                date + timedelta(days=DAYS_IN_AN_INPUT - 1),
                date + timedelta(days=DAYS_IN_AN_INPUT))

            if lastDayRows is None:
                date += self._dataTimeInterval
                repeatLast(*scaleSeries)
                continue

            priceMeansToDivideLabelsBy.append(lastDayRows["Close"].mean())
            volumeMeansToDivideLabelsBy.append(lastDayRows["Volume"].mean())
            date += self._dataTimeInterval

        # Generate technical indicators as additional input features. We seem
        # to be getting good performance if we only use close, volume, rsi,
        # ema and mfi, but there are other indicators to play around with.
        stock = StockDataFrame({
            "close": closeMeans,
            "volume": volumeMeans
        })

        # The standard RSI is 14 day. Note that if our time interval is 3 hrs,
        # there are 8 data points in a day. Thus, a 14 day RSI is a 112-RSI
        # because 14 * 8 = 112.
        rsis = (stock["rsi:112"] / 100).tolist()
        rsis2 = (stock["rsi:14"] / 100).tolist()
        emas = (stock["ema:21"]).tolist()
        macds = stock["macd:96,208"].tolist()
        macds2 = stock["macd:24,52"].tolist()
        bollUppers = stock["boll.upper:160"].tolist()
        bollLowers = stock["boll.lower:160"].tolist()
        from ta.volume import MFIIndicator
        # The close series is passed for high, low and close alike.
        moneyFlowIndex = MFIIndicator(stock["close"], stock["close"], stock["close"], stock["volume"], window=14)
        mfis = (moneyFlowIndex.money_flow_index().divide(100)).to_list()

        # This gets rid of NANs in our indicators (just in case).
        import math

        def nansToZero(values):
            return [0 if math.isnan(x) else x for x in values]

        (rsis, rsis2, emas, macds, macds2,
         bollUppers, bollLowers, mfis) = [
            nansToZero(values)
            for values in (rsis, rsis2, emas, macds, macds2,
                           bollUppers, bollLowers, mfis)
        ]

        # Now we generate the final inputs and outputs.
        entryAmount = int((len(closeMeans) - self._numberOfSamples - 1))
        advanceAmount = self._datapointsPerDay if self.dayByDay else 1

        def fixWithin0And1(x):
            return min(max(x, 0.0), 1.0)

        def halved(values, mean):
            # value / mean / 2, clamped to [0, 1]; 0.5 means "equal to mean".
            return [fixWithin0And1(v / mean / 2) for v in values]

        def centred(values, mean):
            # Same as `halved` but shifted by 0.5 so that 0 maps to 0.5.
            return [fixWithin0And1(v / mean / 2 + 0.5) for v in values]

        for i in range(60 * self._datapointsPerDay, entryAmount, advanceAmount):
            print("Percent of entries created: " + str(i / entryAmount * 100) + "%")
            yesterdayCloseMean = priceMeansToDivideLabelsBy[i]
            yesterdayVolumeMean = volumeMeansToDivideLabelsBy[i]

            # The input features for this dataset entry.
            window = slice(i, i + self._numberOfSamples)
            close = halved(closeMeans[window], yesterdayCloseMean)
            volume = halved(volumeMeans[window], yesterdayVolumeMean)
            rsi = rsis[window]
            rsi2 = rsis2[window]
            ema = halved(emas[window], yesterdayCloseMean)
            macd = centred(macds[window], yesterdayCloseMean)
            macd2 = centred(macds2[window], yesterdayCloseMean)
            mfi = mfis[window]
            bollUpper = halved(bollUppers[window], yesterdayCloseMean)
            bollLower = halved(bollLowers[window], yesterdayCloseMean)

            # Finally, we add the entry to the dataset.
            if useAllIndicators:
                self.inputData.append([close, volume, rsi, rsi2, ema, macd, macd2,
                                       bollUpper, bollLower, mfi])
            else:
                self.inputData.append([close, volume, rsi, ema, mfi])

            # This normalizes our labels. 0.5 means that the percentile is the
            # same as the last day's mean. 1.0 means that the percentile is
            # twice the value of the last day's mean. We normalize in this way
            # so that a sigmoid activation function can be used for the outputs.
            self.outputData.append([
                percentiles[i] / yesterdayCloseMean / 2
                for percentiles in (output15thPercentiles,
                                    output25thPercentiles,
                                    output35thPercentiles,
                                    outputMedians,
                                    output65thPercentiles,
                                    output75thPercentiles,
                                    output85thPercentiles)
            ])
Ejemplo n.º 9
0
import yfinance as yf
import pandas as pd
from stock_pandas import StockDataFrame
import generate_dataframe as gd 
import stock_dataframe_to_stdf as sdts 
import stock_pandas_to_stdf as spts 
import matplotlib
import matplotlib.pyplot as plt 
import numpy as np 

# Build the raw frame for AAPL. NOTE(review): "1y" / "1d" are presumably the
# period and interval understood by generate_df -- confirm.
data = gd.generate_df("AAPL", "1y", "1d")
columns = ['open', 'close', 'high', 'low', 'volume', 'amount']

# Run the frame through both project pipelines, converting back to a plain
# DataFrame after each one.
stockdf = sdts.generate_stdf(data)
stock = sdts.data_processing(columns, stockdf)
df = sdts.to_df(stock)
stockdf = spts.data_processing(StockDataFrame(df))
df = sdts.to_df(stockdf)

# Drop every column that has at least one NaN value.
temp_df = df.dropna(axis=1)
print(temp_df.head())

# Keep only integer and float columns.
numerics = [kind + str(bits) for kind in ('int', 'float') for bits in (16, 32, 64)]
numeric_df = temp_df.select_dtypes(include=numerics)

import seaborn as sns

# Plot the pairwise correlation matrix of the numeric columns.
correlation_matrix = numeric_df.corr()
sns.heatmap(correlation_matrix)
plt.show()