def test1(self):
        '''
        Round-trip the first NVDA quote through QuoteWriter and BinReader.

        The expected values are taken from TAQQuotesReader (index 0), the
        data is written with QuoteWriter, and the first record read back
        with BinReader must carry the same time stamp, sizes and prices.
        '''
        w = QuoteWriter('/Users/Zibin/NYU/qwriter.gz', 1)  # Create a .gz file

        r = TAQQuotesReader(
            '/Users/Zibin/Documents/R/quotes/20070620/NVDA_quotes.binRQ'
        )  # Read a file
        # Using tested readers, test for expected values.
        # Time stamp in milliseconds: midnight (in ms) plus ms since midnight.
        baseTS = r.getSecsFromEpocToMidn() * 1000
        ts = baseTS + r.getMillisFromMidn(0)

        asks = r.getAskSize(0)
        bs = r.getBidSize(0)
        ap = r.getAskPrice(0)
        bp = r.getBidPrice(0)

        w.writer(r, 1, 1)  # write to the file

        b = BinReader('/Users/Zibin/NYU/qwriter.gz', '>QIIfIf',
                      100)  # use binreader to read
        ts0, _, asksize, askprice, bidsize, bidprice = b.next()
        b.close()

        # assertEquals / assertAlmostEquals were removed in Python 3.12;
        # use the supported names.
        self.assertEqual(ts0, ts)  # Check if time stamps are equal
        self.assertEqual(asksize, asks)  # Check if ask sizes are equal
        self.assertEqual(bidsize, bs)  # Check if bid sizes are equal
        self.assertAlmostEqual(bidprice, bp)  # Check if bid prices are equal
        self.assertAlmostEqual(askprice, ap)  # Check if ask prices are equal
Esempio n. 2
0
    def __init__(self, filePath, ty):
        '''
        Load the time stamps and prices of a binary TAQ file into memory.

        For 'trades' the stored price is field 3 of each record; for
        'quotes' it is the mid-quote, the average of fields 3 and 5.

        Parameters
        ------
        filePath : str
            The file path of the data

        ty : str
            type of the data, either 'trades' or 'quotes'
        '''
        # Record layout per data type
        formats = {"trades": '>QIIf', "quotes": '>QIIfIf'}
        fmt = formats.get(ty)

        # Any other type loads nothing (self._ts / self._pr stay unset),
        # exactly as before.
        if fmt is not None:
            reader = BinReader(filePath, fmt, 100)
            self._ts = []
            self._pr = []
            try:
                while reader.hasNext():
                    now = reader.next()
                    self._ts.append(now[0])
                    if ty == "trades":
                        self._pr.append(now[3])
                    else:
                        self._pr.append((now[3] + now[5]) / 2)
            finally:
                # Release the file handle even if reading fails
                reader.close()
Esempio n. 3
0
    def sampleLengthDay(self, date):
        '''
        Count the records whose time stamp falls inside one day.

        A record is counted when its time stamp (milliseconds since the
        epoch, converted with datetime.fromtimestamp) lies strictly between
        midnight of date and midnight of the following day. Reading stops
        at the first time stamp later than the end of the day, so the file
        is assumed to be in chronological order.

        Parameters
        ------
        date : str
            The day to count, formatted 'YYYYMMDD'

        Returns
        ------
        int
            Number of records inside the day

        Raises
        ------
        ValueError
            If date is not a valid 'YYYYMMDD' day or self._ty is neither
            'trades' nor 'quotes'
        '''
        from datetime import timedelta

        if self._ty == 'trades':
            fmt = '>QIIf'
        elif self._ty == 'quotes':
            fmt = '>QIIfIf'
        else:
            raise ValueError(
                "unknown data type {!r}; expected 'trades' or 'quotes'".format(
                    self._ty))

        dateStart = datetime.strptime(date, '%Y%m%d')
        # Add a real day: int(date) + 1 produced invalid dates such as
        # '20070631' at the end of a month.
        dateEnd = dateStart + timedelta(days=1)

        sampleLength = 0  # Counter to count the sample
        data = BinReader(self._path, fmt, 100)
        try:
            while data.hasNext():
                # Convert milliseconds to DateTime
                stamp = datetime.fromtimestamp(data.next()[0] / 1000)
                if stamp > dateEnd:
                    # Already past the target day, nothing left to count
                    break
                if dateStart < stamp < dateEnd:
                    sampleLength += 1
        finally:
            data.close()

        return sampleLength
Esempio n. 4
0
    def TAQNumber(self):
        '''
        Count the records stored in the binary file at self._path.

        Returns
        ------
        int
            Number of records read from the file

        Raises
        ------
        ValueError
            If self._ty is neither 'trades' nor 'quotes'
        '''
        if self._ty == 'trades':
            fmt = '>QIIf'
        elif self._ty == 'quotes':
            fmt = '>QIIfIf'
        else:
            raise ValueError(
                "unknown data type {!r}; expected 'trades' or 'quotes'".format(
                    self._ty))

        data = BinReader(self._path, fmt, 100)
        count = 0
        try:
            while data.hasNext():
                count += 1
                data.next()
        finally:
            # Release the file handle even if reading fails
            data.close()

        return count
Esempio n. 5
0
    def test1(self):
        '''
        Round-trip the first NVDA trade through TradeWriter and BinReader.

        The expected values are taken from TAQTradesReader (index 0), the
        data is written with TradeWriter, and the first record read back
        with BinReader must carry the same time stamp, size and price.
        '''
        dirc = '/Users/Zibin/NYU/twriter.gz'

        w = TradeWriter(dirc, 1)  # Create a .gz file on directory

        r = TAQTradesReader('/Users/Zibin/Documents/R/trades/20070620/NVDA_trades.binRT')  # Read a file
        # Using tested readers, test for expected values.
        # Time stamp in milliseconds: midnight (in ms) plus ms since midnight.
        baseTS = r.getSecsFromEpocToMidn() * 1000
        ts = baseTS + r.getMillisFromMidn(0)

        s = r.getSize(0)
        p = r.getPrice(0)

        w.writer(r, 1, 1)  # write to the file

        b = BinReader(dirc, '>QIIf', 100)  # use binreader to read
        ts0, _, size0, p0 = b.next()
        b.close()

        # assertEquals / assertAlmostEquals were removed in Python 3.12;
        # use the supported names.
        self.assertEqual(ts0, ts)  # Check if time stamps are equal
        self.assertEqual(size0, s)  # Check if numbers of shares are equal
        self.assertAlmostEqual(p0, p)  # Check if prices are equal
class TradeReturn(object):
    '''
    A class that takes binary trade data files and calculates lag returns

    Attributes
    ------
    _Path : str
        Path to the directory that contains the binary trade data files

    _tReader : BinReader
        Reader of the most recent GetReturn call (closed when the call ends)

    Method
    ------
    GetReturn(tick, lagT)
        Get the lagT lag returns of tick
    '''
    def __init__(self, Path):
        '''
        Parameters
        ------
        Path : str
            The path to the directory which contains the binary trade data files
        '''
        self._Path = Path

    def GetReturn(self, tick, lagT):
        '''
        Get the lag returns of a particular ticker's trades

        The file <Path>/<tick>_trades.gz is scanned once. Each time the
        target time is reached or passed, price / base price - 1 is
        recorded and that price becomes the new base price.

        Parameters
        ------
        tick : str
            The ticker we want to compute the return from

        lagT : int
            The lag, in milliseconds, we want to use to compute lag return

        Returns
        ------
        list of float
            The lag returns, in the order they were observed
        '''
        # read the trade file
        self._tReader = BinReader(
            os.path.join(self._Path, tick + "_trades.gz"), '>QIIf', 100)

        # eight hours in milliseconds
        eighthour = int(2.88e7)

        r_lag = []
        try:
            # the next time we are looking for in the list
            # NOTE(review): presumably getSN() is the first time stamp of
            # the file - confirm against BinReader
            find_t = self._tReader.getSN() + lagT
            # setup
            now = self._tReader.next()
            # the previous time we have observed
            pre_t = now[0]
            # the previous price we have observed
            pre_p = now[3]
            # the base price we use to calculate the return
            base_p = now[3]
            # a time stamp to check if we go to tomorrow's data
            today = now[0]

            # loop through the data
            while self._tReader.hasNext():
                now = self._tReader.next()
                # get the current time stamp
                t = now[0]

                if t > today + eighthour:
                    # more than eight hours after the first trade of the
                    # current day: treat this trade as the start of a new
                    # day and restart from its price
                    # NOTE(review): find_t is not reset here, so the next
                    # trade takes the "t > find_t" branch - confirm intended
                    today = t
                    base_p = now[3]

                elif t > find_t:
                    # the target time has been passed: use the previous
                    # price to calculate the lag return
                    r_lag.append(pre_p / base_p - 1)
                    base_p = pre_p
                    # the next time we are looking for
                    find_t = pre_t + lagT
                elif t == find_t:
                    # the current time is exactly the time we are looking
                    # for: calculate the return from the current price
                    r_lag.append(now[3] / base_p - 1)
                    base_p = now[3]
                    find_t += lagT
                pre_p = now[3]
                pre_t = now[0]
        finally:
            # Release the file handle; the original never closed it
            self._tReader.close()

        return r_lag
def ori_vs_adj_midQuote_MTW():
    '''
    Plot original versus adjusted MTW mid-quotes for 2007-09-10 and 2007-09-11.

    Two figures are shown: mid-quote price and mid-quote size. The original
    data comes from the two daily TAQ quote files, the adjusted data from
    adjusted_quotes/MTW_quotes.gz under the current working directory.
    '''
    dayStartMTW = '20070910'  # Start day
    dayEndMTW = '20070911'  # End day

    # Two levels above the directory of this file
    parentddir1 = os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))
    parentddir2 = os.path.abspath(os.path.join(parentddir1, os.path.pardir))

    oriQuoteStartMTW = TAQQuotesReader(os.path.join(parentddir2, 'R/quotes/'+dayStartMTW+'/MTW_quotes.binRQ'))
    oriQuoteEndMTW = TAQQuotesReader(os.path.join(parentddir2, 'R/quotes/'+dayEndMTW+'/MTW_quotes.binRQ'))

    oriPriceMTW = []  # mid-quote prices
    oriShareMTW = []  # mid-quote sizes
    timeList = []  # time stamps in epoch milliseconds

    # Each reader is paired with the epoch milliseconds used as midnight
    # of its day (04:00 UTC on 2007-09-10 and 2007-09-11).
    dailyReaders = ((oriQuoteStartMTW, 1189396800000),
                    (oriQuoteEndMTW, 1189483200000))
    for quotes, midnightMs in dailyReaders:
        for i in range(quotes.getN()):
            oriPriceMTW.append((quotes.getAskPrice(i) + quotes.getBidPrice(i))/2)
            oriShareMTW.append((quotes.getAskSize(i) + quotes.getBidSize(i))/2)
            timeList.append(midnightMs + quotes.getMillisFromMidn(i))

    dateTimeList = [datetime.fromtimestamp(timeStamp/1000) for timeStamp in timeList]

    adjQuoteStartMTW = BinReader(os.path.join(os.getcwd(), 'adjusted_quotes/MTW_quotes.gz'), '>QIIfIf', 100)

    ts = []
    share = []
    price = []
    # The first record is read unconditionally, the rest while available
    record = adjQuoteStartMTW.next()
    while True:
        stamp, _, askSize, askPrice, bidSize, bidPrice = record
        ts.append(stamp)
        share.append(int((askSize + bidSize)/2))
        price.append((askPrice + bidPrice)/2)
        if not adjQuoteStartMTW.hasNext():
            break
        record = adjQuoteStartMTW.next()

    # Locate the adjusted records that span the original time range
    sIdx = binarySearch(ts, 0, len(ts)-1, timeList[0])
    eIdx = binarySearch(ts, 0, len(ts)-1, timeList[-1])

    pList = price[sIdx:eIdx+1]
    sList = share[sIdx:eIdx+1]

    adjQuoteStartMTW.close()

    def _draw(original, adjusted, title, ylabel, ylim=None):
        # One scatter figure: original (yellow circles) vs adjusted (black x)
        plt.figure(figsize=(9,6))
        ax = plt.gca()
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%m-%d %H:%M'))
        plt.plot(dateTimeList[:-6], original, ls = 'None', marker = 'o', ms = 5, color = 'y', label = 'original')
        plt.plot(dateTimeList[:-6], adjusted, ls = 'None', marker = 'x', ms = 2 , color = 'black', label = 'adjusted')
        plt.xticks([datetime.strptime(text, '%Y-%m-%d %H:%M:%S')
                    for text in ('2007-09-10 10:00:00',
                                 '2007-09-10 15:00:00',
                                 '2007-09-11 09:30:00',
                                 '2007-09-11 15:00:00')])
        if ylim is not None:
            plt.ylim(*ylim)
        plt.title(title)
        plt.xlabel('Time')
        plt.ylabel(ylabel)
        plt.legend()
        plt.show()

    # The last 6 original points are dropped before plotting
    # NOTE(review): presumably so the lengths match pList / sList - confirm
    _draw(oriPriceMTW[:-6], pList,
          'Original MTW Mid-Quote Price VS Adjusted MTW Mid-Quote Price',
          'Price')
    _draw(oriShareMTW[:-6], sList,
          'Original MTW Mid-Quote Share VS Adjusted MTW Mid-Quote Share',
          'Number of Shares', ylim=(0, 200))
def ori_vs_adj_trade_NVDA():
    '''
    Plot original versus adjusted NVDA trades for 2007-09-10 and 2007-09-11.

    Two figures are shown: trade price and trade size. The original data
    comes from the two daily TAQ trade files, the adjusted data from
    adjusted_trades/NVDA_trades.gz under the current working directory.
    '''
    dayStartNVDA = '20070910'  # Start day
    dayEndNVDA = '20070911'  # End day

    # Two levels above the directory of this file
    parentddir1 = os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir))
    parentddir2 = os.path.abspath(os.path.join(parentddir1, os.path.pardir))

    oriTradeStartNVDA = TAQTradesReader(os.path.join(parentddir2, 'R/trades/'+dayStartNVDA+'/NVDA_trades.binRT'))
    oriTradeEndNVDA = TAQTradesReader(os.path.join(parentddir2, 'R/trades/'+dayEndNVDA+'/NVDA_trades.binRT'))

    oriPriceNVDA = []  # trade prices
    oriShareNVDA = []  # trade sizes
    timeList = []  # time stamps in epoch milliseconds

    # Each reader is paired with the epoch milliseconds used as midnight
    # of its day (04:00 UTC on 2007-09-10 and 2007-09-11).
    dailyReaders = ((oriTradeStartNVDA, 1189396800000),
                    (oriTradeEndNVDA, 1189483200000))
    for trades, midnightMs in dailyReaders:
        for i in range(trades.getN()):
            oriPriceNVDA.append(trades.getPrice(i))
            oriShareNVDA.append(trades.getSize(i))
            timeList.append(midnightMs + trades.getTimestamp(i))

    # convert data format to datetime
    dateTimeList = [datetime.fromtimestamp(timeStamp/1000) for timeStamp in timeList]

    adjTradeStartNVDA = BinReader(os.path.join(os.getcwd(), 'adjusted_trades/NVDA_trades.gz'), '>QIIf', 100)

    ts = []
    share = []
    price = []
    # The first record is read unconditionally, the rest while available
    record = adjTradeStartNVDA.next()
    while True:
        stamp, _, size, tradePrice = record
        ts.append(stamp)
        share.append(size)
        price.append(tradePrice)
        if not adjTradeStartNVDA.hasNext():
            break
        record = adjTradeStartNVDA.next()

    # Locate the adjusted records that span the original time range
    sIdx = binarySearch(ts, 0, len(ts)-1, timeList[0])  # find the start index
    eIdx = binarySearch(ts, 0, len(ts)-1, timeList[-1])  # find the end index

    pList = price[sIdx:eIdx+1]
    sList = share[sIdx:eIdx+1]

    adjTradeStartNVDA.close()  # close bin reader

    def _draw(original, adjusted, title, ylabel, ylim=None):
        # One scatter figure: original (yellow circles) vs adjusted (black x)
        plt.figure(figsize=(9,6))
        ax = plt.gca()
        ax.xaxis.set_major_formatter(mdates.DateFormatter('%m-%d %H:%M'))
        plt.plot(dateTimeList[:-5], original, ls = 'None', marker = 'o', ms = 5, color = 'y', label = 'original')
        plt.plot(dateTimeList[:-5], adjusted, ls = 'None', marker = 'x', ms = 2 , color = 'black', label = 'adjusted')
        plt.xticks([datetime.strptime(text, '%Y-%m-%d %H:%M:%S')
                    for text in ('2007-09-10 10:00:00',
                                 '2007-09-10 15:00:00',
                                 '2007-09-11 09:30:00',
                                 '2007-09-11 15:00:00')])
        if ylim is not None:
            plt.ylim(*ylim)
        plt.title(title)
        plt.xlabel('Time')
        plt.ylabel(ylabel)
        plt.legend()
        plt.show()

    # The last 5 original points are dropped before plotting
    # NOTE(review): presumably so the lengths match pList / sList - confirm
    _draw(oriPriceNVDA[:-5], pList,
          'Original NVDA Trade Price VS Adjusted NVDA Trade Price',
          'Price')
    _draw(oriShareNVDA[:-5], sList,
          'Original NVDA Trade Share VS Adjusted NVDA Trade Share',
          'Number of Shares', ylim=(0, 10000))
Esempio n. 9
0
    def clean_trades(self, tick, subset=False, save=False):
        '''
        Clean the TAQ trades data

        Each trade price is compared, through self.threshold, with the mean
        and standard deviation of a window of self._k prices and with the
        mean of all prices; trades that fail the test are dropped.

        Parameters
        ------
        tick : str
            The ticker of the data to be cleaned

        subset : boolean
            If True, stop screening at record int(count / 65); the first
            int(k / 2) + 1 records are screened in any case

        save : boolean
            If True, append the kept records to <tick>_trades.gz in the
            trade save directory instead of returning them

        Returns
        ------
        None when save is True, otherwise a tuple of
        (number of records read, kept time stamps, kept prices)

        Raises
        ------
        IndexError
            If the file holds fewer than self._k records
        '''
        print("Cleaning trade", tick)
        self._tReader = BinReader(
            os.path.join(self._source, tick + "_trades.gz"), '>QIIf', 100)

        out = None
        try:
            prices = CircularArray(self._k)

            if save:
                # set up the writer
                tw = self.writer('>QIIf')
                out = gzip.open(
                    os.path.join(self._tradeSaveDirect, tick + "_trades.gz"),
                    "ab")

            # get the data into memory
            rows = []
            while self._tReader.hasNext():
                t, i, s, p = self._tReader.next()
                rows.append((t, i, s, p))
            count = len(rows)
            print(tick, "has ", count, "entries")

            ps = [row[3] for row in rows]

            # subset the data
            stop = int(count / 65) if subset else count
            tot_mean = np.mean(ps)

            clTs = []
            clPs = []

            def keep(idx):
                # Write record idx when saving, collect it otherwise
                if save:
                    out.write(tw.pack(*rows[idx]))
                else:
                    clTs.append(rows[idx][0])
                    clPs.append(ps[idx])

            # initialize the filter with the first k prices
            for idx in range(self._k):
                prices.add(ps[idx])
            pm = prices.mean()
            std = prices.std()

            # screen the first k/2 data against the initial window
            first = int(self._k / 2) + 1
            for idx in range(first):
                if self.threshold(ps[idx], pm, std, tot_mean):
                    keep(idx)

            # screen the rest of the data
            for offset, idx in enumerate(range(first, stop)):
                cp = ps[idx]
                # The window is only updated while its leading position
                # (k + offset) is still inside the data.
                # NOTE(review): the window is fed the price being screened
                # rather than the price at the leading position - confirm
                if self._k + offset < count:
                    prices.add(cp)
                    pm = prices.mean()
                    std = prices.std()
                if self.threshold(cp, pm, std, tot_mean):
                    keep(idx)
        finally:
            # Release both file handles even when cleaning fails
            try:
                if out is not None:
                    out.close()
            finally:
                self._tReader.close()

        if not save:
            return (count, clTs, clPs)
Esempio n. 10
0
    def clean_quotes(self, tick, subset=False, save=False):
        '''
        Clean the TAQ quotes data

        Each mid-quote (average of ask and bid price) is compared, through
        self.threshold, with the mean and standard deviation of a window of
        self._k mid-quotes and with the average of the overall ask and bid
        means; quotes that fail the test are dropped.

        Parameters
        ------
        tick : str
            The ticker of the data to be cleaned

        subset : boolean
            If True, stop screening at record int(count / 65); the first
            int(k / 2) + 1 records are screened in any case

        save : boolean
            If True, append the kept records to <tick>_quotes.gz in the
            quote save directory instead of returning them

        Returns
        ------
        None when save is True, otherwise a tuple of
        (number of records read, kept time stamps, kept ask prices,
        kept bid prices)

        Raises
        ------
        IndexError
            If the file holds fewer than self._k records
        '''
        print("Cleaning quote ", tick)
        self._qReader = BinReader(
            os.path.join(self._source, tick + "_quotes.gz"), '>QIIfIf', 100)

        out = None
        try:
            mPrices = CircularArray(self._k)

            if save:
                # set up the writer
                qw = self.writer('>QIIfIf')
                out = gzip.open(
                    os.path.join(self._quoteSaveDirect, tick + "_quotes.gz"),
                    "ab")

            # get the data into memory
            rows = []
            while self._qReader.hasNext():
                t, i, asi, ap, bs, bp = self._qReader.next()
                rows.append((t, i, asi, ap, bs, bp))
            count = len(rows)
            print(tick, "has ", count, "entries")

            aps = [row[3] for row in rows]
            bps = [row[5] for row in rows]
            mids = [(ap + bp) / 2 for ap, bp in zip(aps, bps)]

            # subset the data
            stop = int(count / 65) if subset else count

            amean = np.mean(aps)
            bmean = np.mean(bps)
            mid_mean = (amean + bmean) / 2

            clTs = []
            clAps = []
            clBps = []

            def keep(idx):
                # Write record idx when saving, collect it otherwise
                if save:
                    out.write(qw.pack(*rows[idx]))
                else:
                    clTs.append(rows[idx][0])
                    clAps.append(aps[idx])
                    clBps.append(bps[idx])

            # initialize the filter with the first k mid-quotes
            for idx in range(self._k):
                mPrices.add(mids[idx])
            mpm = mPrices.mean()
            mstd = mPrices.std()

            # screen the first k/2 data against the initial window
            first = int(self._k / 2) + 1
            for idx in range(first):
                if self.threshold(mids[idx], mpm, mstd, mid_mean):
                    keep(idx)

            # screen the rest of the data
            for offset, idx in enumerate(range(first, stop)):
                mcp = mids[idx]
                # The window is only updated while its leading position
                # (k + offset) is still inside the data.
                # NOTE(review): the window is fed the mid-quote being
                # screened rather than the one at the leading position -
                # confirm
                if self._k + offset < count:
                    mPrices.add(mcp)
                    mpm = mPrices.mean()
                    mstd = mPrices.std()
                if self.threshold(mcp, mpm, mstd, mid_mean):
                    keep(idx)
        finally:
            # Release both file handles even when cleaning fails
            try:
                if out is not None:
                    out.close()
            finally:
                self._qReader.close()

        if not save:
            return (count, clTs, clAps, clBps)
Esempio n. 11
0
class CleanTAQ(object):
    '''
    A class that cleans the adjusted TAQ data

    Records whose price lies too far from the mean of a moving window are
    dropped; see threshold for the exact rule.
    '''
    def __init__(self, fileDirectory, k=5, gCoeff=0.0005):
        '''
        Parameters
        ------
        fileDirectory : str
            Path to the sources of the adjusted TAQ data

        k : int
            Window parameter for the filtering model

        gCoeff : float
            Gamma coefficient parameter for the filtering model

        '''
        self._k = k
        self._gCoeff = gCoeff
        self._source = fileDirectory

        # the save directories live in the parent of the working directory
        self._direct = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
        self._tradeSaveDirect = os.path.join(self._direct, "clean_trades")
        self._quoteSaveDirect = os.path.join(self._direct, "clean_quotes")
        # create the directories if they do not exist; exist_ok avoids the
        # race between checking and creating
        os.makedirs(self._tradeSaveDirect, exist_ok=True)
        os.makedirs(self._quoteSaveDirect, exist_ok=True)

    def threshold(self, p, mean, std, tot_mean):
        '''
        Determine whether a data point stays or is removed

        The point stays when |p - mean| < 2 * std + gCoeff * tot_mean.

        Parameters
        ------
        p : float
            current data price

        mean : float
            current window mean

        std : float
            current window std

        tot_mean : float
            mean of all of the data

        Returns
        ------
          : boolean
              True if the data stays, False if the data goes
        '''
        upperbound = 2 * std + self._gCoeff * tot_mean
        return bool(np.abs(p - mean) < upperbound)

    def writer(self, fmt):
        '''
        Build the struct used to pack records

        Parameters
        ------
        fmt : str
            struct format string of one record

        Returns
        ------
          : struct.Struct
              Compiled struct for fmt
        '''
        return struct.Struct(fmt)

    def clean_trades(self, tick, subset=False, save=False):
        '''
        Clean the TAQ trades data

        Each trade price is compared, through self.threshold, with the mean
        and standard deviation of a window of self._k prices and with the
        mean of all prices; trades that fail the test are dropped.

        Parameters
        ------
        tick : str
            The ticker of the data to be cleaned

        subset : boolean
            If True, stop screening at record int(count / 65); the first
            int(k / 2) + 1 records are screened in any case

        save : boolean
            If True, append the kept records to <tick>_trades.gz in the
            trade save directory instead of returning them

        Returns
        ------
        None when save is True, otherwise a tuple of
        (number of records read, kept time stamps, kept prices)

        Raises
        ------
        IndexError
            If the file holds fewer than self._k records
        '''
        print("Cleaning trade", tick)
        self._tReader = BinReader(
            os.path.join(self._source, tick + "_trades.gz"), '>QIIf', 100)

        out = None
        try:
            prices = CircularArray(self._k)

            if save:
                # set up the writer
                tw = self.writer('>QIIf')
                out = gzip.open(
                    os.path.join(self._tradeSaveDirect, tick + "_trades.gz"),
                    "ab")

            # get the data into memory
            rows = []
            while self._tReader.hasNext():
                t, i, s, p = self._tReader.next()
                rows.append((t, i, s, p))
            count = len(rows)
            print(tick, "has ", count, "entries")

            ps = [row[3] for row in rows]

            # subset the data
            stop = int(count / 65) if subset else count
            tot_mean = np.mean(ps)

            clTs = []
            clPs = []

            def keep(idx):
                # Write record idx when saving, collect it otherwise
                if save:
                    out.write(tw.pack(*rows[idx]))
                else:
                    clTs.append(rows[idx][0])
                    clPs.append(ps[idx])

            # initialize the filter with the first k prices
            for idx in range(self._k):
                prices.add(ps[idx])
            pm = prices.mean()
            std = prices.std()

            # screen the first k/2 data against the initial window
            first = int(self._k / 2) + 1
            for idx in range(first):
                if self.threshold(ps[idx], pm, std, tot_mean):
                    keep(idx)

            # screen the rest of the data
            for offset, idx in enumerate(range(first, stop)):
                cp = ps[idx]
                # The window is only updated while its leading position
                # (k + offset) is still inside the data.
                # NOTE(review): the window is fed the price being screened
                # rather than the price at the leading position - confirm
                if self._k + offset < count:
                    prices.add(cp)
                    pm = prices.mean()
                    std = prices.std()
                if self.threshold(cp, pm, std, tot_mean):
                    keep(idx)
        finally:
            # Release both file handles even when cleaning fails
            try:
                if out is not None:
                    out.close()
            finally:
                self._tReader.close()

        if not save:
            return (count, clTs, clPs)

    def clean_quotes(self, tick, subset=False, save=False):
        '''
        Clean the TAQ quotes data

        Each mid-quote (average of ask and bid price) is compared, through
        self.threshold, with the mean and standard deviation of a window of
        self._k mid-quotes and with the average of the overall ask and bid
        means; quotes that fail the test are dropped.

        Parameters
        ------
        tick : str
            The ticker of the data to be cleaned

        subset : boolean
            If True, stop screening at record int(count / 65); the first
            int(k / 2) + 1 records are screened in any case

        save : boolean
            If True, append the kept records to <tick>_quotes.gz in the
            quote save directory instead of returning them

        Returns
        ------
        None when save is True, otherwise a tuple of
        (number of records read, kept time stamps, kept ask prices,
        kept bid prices)

        Raises
        ------
        IndexError
            If the file holds fewer than self._k records
        '''
        print("Cleaning quote ", tick)
        self._qReader = BinReader(
            os.path.join(self._source, tick + "_quotes.gz"), '>QIIfIf', 100)

        out = None
        try:
            mPrices = CircularArray(self._k)

            if save:
                # set up the writer
                qw = self.writer('>QIIfIf')
                out = gzip.open(
                    os.path.join(self._quoteSaveDirect, tick + "_quotes.gz"),
                    "ab")

            # get the data into memory
            rows = []
            while self._qReader.hasNext():
                t, i, asi, ap, bs, bp = self._qReader.next()
                rows.append((t, i, asi, ap, bs, bp))
            count = len(rows)
            print(tick, "has ", count, "entries")

            aps = [row[3] for row in rows]
            bps = [row[5] for row in rows]
            mids = [(ap + bp) / 2 for ap, bp in zip(aps, bps)]

            # subset the data
            stop = int(count / 65) if subset else count

            amean = np.mean(aps)
            bmean = np.mean(bps)
            mid_mean = (amean + bmean) / 2

            clTs = []
            clAps = []
            clBps = []

            def keep(idx):
                # Write record idx when saving, collect it otherwise
                if save:
                    out.write(qw.pack(*rows[idx]))
                else:
                    clTs.append(rows[idx][0])
                    clAps.append(aps[idx])
                    clBps.append(bps[idx])

            # initialize the filter with the first k mid-quotes
            for idx in range(self._k):
                mPrices.add(mids[idx])
            mpm = mPrices.mean()
            mstd = mPrices.std()

            # screen the first k/2 data against the initial window
            first = int(self._k / 2) + 1
            for idx in range(first):
                if self.threshold(mids[idx], mpm, mstd, mid_mean):
                    keep(idx)

            # screen the rest of the data
            for offset, idx in enumerate(range(first, stop)):
                mcp = mids[idx]
                # The window is only updated while its leading position
                # (k + offset) is still inside the data.
                # NOTE(review): the window is fed the mid-quote being
                # screened rather than the one at the leading position -
                # confirm
                if self._k + offset < count:
                    mPrices.add(mcp)
                    mpm = mPrices.mean()
                    mstd = mPrices.std()
                if self.threshold(mcp, mpm, mstd, mid_mean):
                    keep(idx)
        finally:
            # Release both file handles even when cleaning fails
            try:
                if out is not None:
                    out.close()
            finally:
                self._qReader.close()

        if not save:
            return (count, clTs, clAps, clBps)
Created on Feb 24, 2020

@author: natehuang
'''
from dbReaders.BinReader import BinReader
import matplotlib.pyplot as plt
import os
from datetime import datetime

if __name__ == '__main__':
    # Script entry point: loads the trade series of one ticker from two
    # different directories into parallel time-stamp / price lists.
    # NOTE(review): matplotlib is imported above, so the two series are
    # presumably plotted against each other further down -- confirm.

    # change tick1 for another stock
    tick1 = "RRC"

    # First series: <cwd>/adjusted_trades/<ticker>_trades.gz
    filePath = os.path.join(os.getcwd(), "adjusted_trades",
                            tick1 + "_trades.gz")
    # Records are unpacked with the struct format '>QIIf'.
    tr = BinReader(filePath, '>QIIf', 100)
    ts1 = []  # time stamps of the first series, as datetime objects
    ps1 = []  # prices of the first series
    while tr.hasNext():
        now = tr.next()
        # Field 0 is divided by 1000 before conversion, i.e. it is treated
        # as milliseconds since the epoch; fromtimestamp() yields local time.
        ts1.append(datetime.fromtimestamp(now[0] / 1000))
        # Field 3 is stored as the price.
        ps1.append(now[3])

    # Second series: <parent of cwd>/clean_trades/<ticker>_trades.gz
    # NOTE(review): the first reader is rebound here without a visible
    # close() call -- confirm whether BinReader needs explicit closing.
    filePath = os.path.join(
        os.path.abspath(os.path.join(os.getcwd(), os.pardir)), "clean_trades",
        tick1 + "_trades.gz")
    tr = BinReader(filePath, '>QIIf', 100)
    ts2 = []  # time stamps of the second series
    ps2 = []  # prices of the second series
    # Same record layout as above.
    while tr.hasNext():
        now = tr.next()
Esempio n. 13
0
    def tradeReturnStat(self):
        '''
        Compute summary statistics of the daily trade returns.

        Every record of the trades file at ``self._path`` is read with the
        format '>QIIf'; field 0 is used as a millisecond time stamp and
        field 3 as the price.  Starting at 2007-06-20, 65 one-day windows
        are evaluated.  After each window the dates move forward by one
        day (until 2007-09-21 is reached), jump over weekends and skip
        2007-07-04 and 2007-09-03.  The return of a window is the relative
        change between the first and the last price whose time stamp is
        later than the window start and not later than the window end.
        Windows without any record, and windows whose return is exactly
        zero, contribute nothing.

        Returns
        ------
        tuple
            (annualized mean return, annualized median return,
            annualized standard deviation, annualized MAD, skewness,
            kurtosis, ten largest returns, ten smallest returns,
            maximum drawdown of the whole price series)
        '''
        tradeData = BinReader(self._path, '>QIIf', 100)

        ts = []  # time stamps converted to datetime
        pr = []  # prices
        try:
            while tradeData.hasNext():
                t, _, _, p = tradeData.next()
                # Milliseconds -> seconds -> (local time) datetime
                ts.append(datetime.fromtimestamp(t / 1000))
                pr.append(p)
        finally:
            # Release the reader even when a record cannot be processed
            tradeData.close()

        oneDay = pd.Timedelta(1, unit="d")
        days = 65  # total number of day windows
        dayN = datetime.strptime('20070921', '%Y%m%d')  # last start date
        independenceDay = datetime.strptime('20070704', '%Y%m%d')
        laborDay = datetime.strptime('20070903', '%Y%m%d')

        startDate = datetime.strptime('20070620', '%Y%m%d')  # start day
        endDate = datetime.strptime('20070621', '%Y%m%d')  # one day later

        def firstIndexAfter(moment, begin):
            # Index (>= begin) of the first time stamp strictly later than
            # moment, or len(ts) when there is none.
            for idx in range(begin, len(ts)):
                if ts[idx] > moment:
                    return idx
            return len(ts)

        tradeReturn = []
        cursor = 0  # start index of the previous window
        for _ in range(days):
            # The window dates never move backwards, so everything before
            # the previous start index is known to be too early and the
            # scan can resume there instead of at index 0.
            startIdx = firstIndexAfter(startDate, cursor)
            endIdx = firstIndexAfter(endDate, startIdx)
            cursor = startIdx

            # endIdx == startIdx means no record falls inside the window;
            # indexing then would mix prices of neighbouring days, wrap
            # around to pr[-1] or run past the end of the list.
            if endIdx > startIdx:
                prPerChange = (pr[endIdx - 1] -
                               pr[startIdx]) / pr[startIdx]  # daily return
                if prPerChange != 0:  # zero returns are left out
                    tradeReturn.append(prPerChange)

            if startDate < dayN:  # Move to the next day
                startDate = startDate + oneDay
                endDate = endDate + oneDay

            if startDate.weekday() >= 5:  # Weekend: jump two days ahead
                startDate = startDate + oneDay + oneDay
                endDate = endDate + oneDay + oneDay

            # Labor Day and Independence Day are holidays
            if startDate == laborDay or startDate == independenceDay:
                startDate = startDate + oneDay
                endDate = endDate + oneDay

        # Calculate a bunch of statistics
        meanReturn = np.mean(tradeReturn)  # Mean
        medianReturn = np.median(tradeReturn)  # Median
        stdReturn = np.std(tradeReturn)  # Standard deviation
        # NOTE(review): the deviations are measured from the mean, not from
        # the median -- confirm that this is the intended definition.
        MAD = np.median([abs(r - meanReturn) for r in tradeReturn])

        annualilzedMeanReturn = (
            meanReturn + 1)**252 - 1  # annualized return, 252 trading days
        annualilzedMedianReturn = (medianReturn + 1)**252 - 1
        annualilzedStdReturn = stdReturn * sqrt(252)
        annualizedMAD = (MAD + 1)**252 - 1
        skewReturn = skew(tradeReturn)  # Skewness
        kurtReturn = kurtosis(tradeReturn)  # Kurtosis
        tenLargest = sorted(tradeReturn, reverse=True)[0:10]  # descending
        tenSmallest = sorted(tradeReturn)[0:10]  # ascending

        # Drawdown of every price relative to the running maximum so far
        maximums = np.maximum.accumulate(pr)
        drawdowns = 1 - pr / maximums
        maxDrawDown = np.max(drawdowns)  # maximum draw down

        return annualilzedMeanReturn, annualilzedMedianReturn, annualilzedStdReturn, annualizedMAD, skewReturn, kurtReturn, tenLargest, tenSmallest, maxDrawDown
Esempio n. 14
0
    def midQuoteReturnStat(self):
        '''
        Compute summary statistics of the daily mid-quote returns.

        Every record of the quotes file at ``self._path`` is read with the
        format '>QIIfIf'; field 0 is used as a millisecond time stamp,
        field 3 as the ask price and field 5 as the bid price, and the mid
        quote is their average.  Starting at 2007-06-20, 65 one-day windows
        are evaluated.  After each window the dates move forward by one
        day (until 2007-09-21 is reached), jump over weekends and skip
        2007-07-04 and 2007-09-03.  The return of a window is the relative
        change between the first and the last mid quote whose time stamp is
        later than the window start and not later than the window end.
        Windows without any record, and windows whose return is exactly
        zero, contribute nothing.

        Returns
        ------
        tuple
            (annualized mean return, annualized median return,
            annualized standard deviation, annualized MAD, skewness,
            kurtosis, ten largest returns, ten smallest returns,
            maximum drawdown of the whole mid-quote series)
        '''
        quoteData = BinReader(self._path, '>QIIfIf', 100)

        ts = []  # time stamps converted to datetime
        mid = []  # mid quotes
        try:
            while quoteData.hasNext():
                t, _, _, askPrice, _, bidPrice = quoteData.next()
                # Milliseconds -> seconds -> (local time) datetime
                ts.append(datetime.fromtimestamp(t / 1000))
                mid.append((askPrice + bidPrice) / 2)
        finally:
            # Release the reader even when a record cannot be processed
            quoteData.close()

        oneDay = pd.Timedelta(1, unit="d")
        days = 65  # total number of day windows
        dayN = datetime.strptime('20070921', '%Y%m%d')  # last start date
        independenceDay = datetime.strptime('20070704', '%Y%m%d')
        laborDay = datetime.strptime('20070903', '%Y%m%d')

        startDate = datetime.strptime('20070620', '%Y%m%d')  # start day
        endDate = datetime.strptime('20070621', '%Y%m%d')  # one day later

        def firstIndexAfter(moment, begin):
            # Index (>= begin) of the first time stamp strictly later than
            # moment, or len(ts) when there is none.
            for idx in range(begin, len(ts)):
                if ts[idx] > moment:
                    return idx
            return len(ts)

        midReturn = []
        cursor = 0  # start index of the previous window
        for _ in range(days):
            # The window dates never move backwards, so everything before
            # the previous start index is known to be too early and the
            # scan can resume there instead of at index 0.
            startIdx = firstIndexAfter(startDate, cursor)
            endIdx = firstIndexAfter(endDate, startIdx)
            cursor = startIdx

            # endIdx == startIdx means no quote falls inside the window;
            # indexing then would mix quotes of neighbouring days, wrap
            # around to mid[-1] or run past the end of the list.
            if endIdx > startIdx:
                prPerChange = (mid[endIdx - 1] -
                               mid[startIdx]) / mid[startIdx]  # daily return
                if prPerChange != 0:  # zero returns are left out
                    midReturn.append(prPerChange)

            if startDate < dayN:  # Move to the next day
                startDate = startDate + oneDay
                endDate = endDate + oneDay

            if startDate.weekday() >= 5:  # Weekend: jump two days ahead
                startDate = startDate + oneDay + oneDay
                endDate = endDate + oneDay + oneDay

            # Labor Day and Independence Day are holidays
            if startDate == laborDay or startDate == independenceDay:
                startDate = startDate + oneDay
                endDate = endDate + oneDay

        meanReturn = np.mean(midReturn)
        medianReturn = np.median(midReturn)
        stdReturn = np.std(midReturn)
        # NOTE(review): the deviations are measured from the mean, not from
        # the median -- confirm that this is the intended definition.
        MAD = np.median([abs(r - meanReturn) for r in midReturn])

        annualilzedMeanReturn = (
            meanReturn + 1)**252 - 1  # annualized return, 252 trading days
        annualilzedMedianReturn = (medianReturn + 1)**252 - 1
        annualilzedStdReturn = stdReturn * sqrt(252)
        annualizedMAD = (MAD + 1)**252 - 1  # median absolute deviation
        skewReturn = skew(midReturn)
        kurtReturn = kurtosis(midReturn)
        tenLargest = sorted(midReturn, reverse=True)[0:10]  # descending
        tenSmallest = sorted(midReturn)[0:10]  # ascending

        # Drawdown of every mid quote relative to the running maximum so far
        maximums = np.maximum.accumulate(mid)
        drawdowns = 1 - mid / maximums
        maxDrawDown = np.max(drawdowns)

        return annualilzedMeanReturn, annualilzedMedianReturn, annualilzedStdReturn, annualizedMAD, skewReturn, kurtReturn, tenLargest, tenSmallest, maxDrawDown