Beispiel #1
0
def UpdateHDF5(symbol_directory, symbols_file):

    ##
    ##  Update symbols in 'symbols_file' with quotes more recent than last update.
    ##

    filename = os.path.join(symbol_directory, symbols_file)

    x, symbols, datearray, quote, listname = loadQuotes_fromHDF(filename)

    # get last date in hdf5 archive
    #from datetime import datetime
    import datetime

    date = quote.index
    lastdate = getLastDateFromHDF5(symbol_directory, symbols_file)

    ##
    ## Get quotes for each symbol in list
    ## process dates.
    ## Clean up quotes.
    ## Make a plot showing all symbols in list
    ##

    # locate symbols added to list that aren't in HDF5 file
    symbols_in_list = readSymbolList(filename, verbose=False)
    symbols_in_HDF5 = list(quote.columns.values)
    new_symbols = [x for x in symbols_in_list if x not in symbols_in_HDF5]

    # write new symbols to temporary file
    if len(new_symbols) > 0:
        # write new symbols to temporary file
        tempfilename = os.path.join(symbol_directory,
                                    "newsymbols_tempfile.txt")
        OUTFILE = open(tempfilename, "w", 0)
        for i, isymbol in enumerate(new_symbols):
            print "new symbol = ", isymbol
            OUTFILE.write(str(isymbol) + "\n")

        newquotesfirstdate = datetime.date(1991, 1, 1)
        newquoteslastdate = datetime.date.today()

        # print dates to be used
        print "dates for new symbol found = ", newquotesfirstdate, newquoteslastdate

        newadjClose, newsymbols, newdatearray = arrayFromQuotesForList(
            tempfilename, newquotesfirstdate, newquoteslastdate)

        print " security values check: ", newadjClose[isnan(newadjClose)].shape

        newdates = []
        for i in range(newdatearray.shape[0]):
            newdates.append(str(newdatearray[i]))
        #quotes_NewSymbols = pd.DataFrame(newadjClose, [symbols,newdates], dtype=float)
        quotes_NewSymbols = pd.DataFrame(newadjClose.swapaxes(0, 1),
                                         index=newdates,
                                         columns=newsymbols)

    ##
    ## Get quotes for each symbol in list
    ## process dates.
    ## Clean up quotes.
    ## Make a plot showing all symbols in list
    ##

    if type(lastdate) == str:
        newquotesfirstdate = datetime.date(
            *[int(val) for val in lastdate.split('-')])
    else:
        newquotesfirstdate = lastdate
    today = datetime.datetime.now()
    tomorrow = today + timedelta(days=1)
    newquoteslastdate = tomorrow

    newadjClose, symbols, newdatearray = arrayFromQuotesForList(
        filename, newquotesfirstdate, newquoteslastdate)

    print " security values check: ", newadjClose[isnan(newadjClose)].shape

    newdates = []
    for i in range(newdatearray.shape[0]):
        newdates.append(str(newdatearray[i]))
    quoteupdate = pd.DataFrame(newadjClose.swapaxes(0, 1),
                               index=newdates,
                               columns=symbols)

    updatedquotes = quoteupdate.combine_first(quote)

    ###################
    from functions.TAfunctions import cleanspikes
    from functions.TAfunctions import interpolate
    from functions.TAfunctions import cleantobeginning

    # clean up quotes for missing values and varying starting date
    #x = quote.as_matrix().swapaxes(0,1)
    xupdate = updatedquotes.values.T
    symbolListupdate = list(updatedquotes.columns.values)

    # Clean up input quotes
    #  - infill interior NaN values using nearest good values to linearly interpolate
    #  - copy first valid quote to from valid date to all earlier positions
    #for ii in range(x.shape[0]):
    for ii, isymbolupdate in enumerate(symbolListupdate):
        '''
        if ii%5 == 0:
            print "  ... progress:  ii, symbol = ", ii, isymbolupdate
        '''
        xupdate = updatedquotes[isymbolupdate].values
        xupdate = cleanspikes(xupdate)
        xupdate = interpolate(xupdate)
        xupdate = cleantobeginning(xupdate)
        updatedquotes[isymbolupdate] = xupdate
    ###################

    if len(new_symbols) > 0:
        print "\n\n\n...quotes_NewSymbols = ", quotes_NewSymbols.info()
        print "\n\n\n...updatedquotes = ", updatedquotes.info()
        for isymbol in new_symbols:
            updatedquotes[isymbol] = quotes_NewSymbols[isymbol]
        print "\n\n\n...merged updatedquotes = ", updatedquotes.info()

    CASHadjClose = np.ones((len(updatedquotes.index)), float) * 100.
    for i in range(CASHadjClose.shape[0]):
        if i % 10 == 0:
            CASHadjClose[i] = CASHadjClose[i - 1] + .01
        else:
            CASHadjClose[i] = CASHadjClose[i - 1]

    updatedquotes['CASH'] = CASHadjClose

    # set up to write quotes to disk.
    dirname = os.path.join(os.getcwd(), "symbols")

    hdf5filename = os.path.join(dirname, listname + "_.hdf5")
    print "hdf5 filename = ", hdf5filename
    updatedquotes.to_hdf(hdf5filename,
                         listname,
                         mode='a',
                         format='table',
                         append=False,
                         complevel=5,
                         complib='blosc')

    return
Beispiel #2
0
def cleanup_quotes(symbols_file, newquotesfirstdate, newquoteslastdate):
    # compare quotes currently on hdf with updated quotes from internet.
    print " ...   inside compareHDF_and_newquotes   ..."
    print " ... newquotesfirstdate = ", newquotesfirstdate
    print " ... newquoteslastdate = ", newquoteslastdate

    # get existing quotes from hdf
    (directory_name, file_name) = os.path.split(symbols_file)
    (shortname, extension) = os.path.splitext(file_name)

    print "file name for symbols = ", "_" + shortname + "_"
    print "file type for symbols = ", extension

    # set up to write quotes to disk.

    if shortname == "symbols":
        listname = "TAA-Symbols"
    elif shortname == "cmg_symbols":
        listname = "CMG-Symbols"
    elif shortname == "Naz100_Symbols":
        listname = "Naz100_Symbols"
    elif shortname == "biglist":
        listname = "biglist-Symbols"
    elif shortname == "ETF_symbols":
        listname = "ETF-Symbols"
    elif shortname == "ProvidentFundSymbols":
        listname = "ProvidentFund-Symbols"
    elif shortname == "sp500_symbols":
        listname = "SP500-Symbols"
    else:
        listname = shortname

    hdf5_directory = os.path.join(os.getcwd(), "symbols")
    hdf5filename = os.path.join(hdf5_directory, listname + "_.hdf5")

    print ""
    print ""
    print "symbol_directory = ", directory_name
    print "symbols_file = ", symbols_file
    print "shortname, extension = ", shortname, extension
    print "hdf5filename = ", hdf5filename

    dataframeFromHDF = pd.read_hdf(hdf5filename, listname)
    x_hdf = dataframeFromHDF.as_matrix()
    x_hdf = x_hdf.swapaxes(0, 1)
    date_hdf = dataframeFromHDF.index
    symbols_hdf = list(dataframeFromHDF.columns.values)

    # Clean up input quotes
    #  - infill interior NaN values using nearest good values to linearly interpolate
    #  - copy first valid quote to from valid date to all earlier positions
    #for ii in range(x.shape[0]):
    for ii, isymbolupdate in enumerate(symbols_hdf):
        xupdate = dataframeFromHDF[isymbolupdate]
        '''
        if isymbolupdate == 'SBUX':
            import pdb
            pdb.set_trace()
        '''
        print " ... cleanup_quotes ... symbol = ", isymbolupdate
        xupdate = cleanspikes(xupdate)
        xupdate = interpolate(xupdate)
        xupdate = cleantobeginning(xupdate)
        dataframeFromHDF[isymbolupdate] = xupdate.copy()
        #xupdate[ii,:] = np.array(xupdate[ii,:]).astype('float')
        #xupdate[ii,:] = interpolate(xupdate[ii,:])
        #xupdate[ii,:] = cleantobeginning(xupdate[ii,:])

    dataframeFromHDF.to_hdf(hdf5filename,
                            listname,
                            mode='a',
                            format='table',
                            append=False,
                            complevel=5,
                            complib='blosc')

    return
Beispiel #3
0
def compareHDF_and_newquotes(symbols_file, newquotesfirstdate,
                             newquoteslastdate):
    # compare quotes currently on hdf with updated quotes from internet.
    print " ...   inside compareHDF_and_newquotes   ..."
    print " ... newquotesfirstdate = ", newquotesfirstdate
    print " ... newquoteslastdate = ", newquoteslastdate

    # get existing quotes from hdf
    (directory_name, file_name) = os.path.split(symbols_file)
    (shortname, extension) = os.path.splitext(file_name)

    print "file name for symbols = ", "_" + shortname + "_"
    print "file type for symbols = ", extension

    # set up to write quotes to disk.

    if shortname == "symbols":
        listname = "TAA-Symbols"
    elif shortname == "cmg_symbols":
        listname = "CMG-Symbols"
    elif shortname == "Naz100_Symbols":
        listname = "Naz100_Symbols"
    elif shortname == "biglist":
        listname = "biglist-Symbols"
    elif shortname == "ETF_symbols":
        listname = "ETF-Symbols"
    elif shortname == "ProvidentFundSymbols":
        listname = "ProvidentFund-Symbols"
    elif shortname == "sp500_symbols":
        listname = "SP500-Symbols"
    else:
        listname = shortname

    hdf5_directory = os.path.join(os.getcwd(), "symbols")
    hdf5filename = os.path.join(hdf5_directory, listname + "_.hdf5")

    print ""
    print ""
    print "symbol_directory = ", directory_name
    print "symbols_file = ", symbols_file
    print "shortname, extension = ", shortname, extension
    print "hdf5filename = ", hdf5filename

    dataframeFromHDF = pd.read_hdf(hdf5filename, listname)
    x_hdf = dataframeFromHDF.as_matrix()
    x_hdf = x_hdf.swapaxes(0, 1)
    date_hdf = dataframeFromHDF.index
    symbols_hdf = list(dataframeFromHDF.columns.values)

    # get new quotes dataframe from internet
    newadjClose, newsymbols, newdatearray = arrayFromQuotesForList(
        symbols_file, newquotesfirstdate, newquoteslastdate)
    print " security values check: ", newadjClose[isnan(newadjClose)].shape
    newdates = []
    for i in range(newdatearray.shape[0]):
        newdates.append(str(newdatearray[i]))
    #quotes_NewSymbols = pd.DataFrame(newadjClose, [symbols,newdates], dtype=float)
    dataframeFromInternet = pd.DataFrame(newadjClose.swapaxes(0, 1),
                                         index=newdates,
                                         columns=newsymbols)

    ###################
    from functions.TAfunctions import interpolate
    from functions.TAfunctions import cleantobeginning

    # clean up quotes for missing values and varying starting date
    #x = quote.as_matrix().swapaxes(0,1)
    ##xupdate = dataframeFromInternet.values.T
    symbolListupdate = list(dataframeFromInternet.columns.values)

    # Clean up input quotes
    #  - infill interior NaN values using nearest good values to linearly interpolate
    #  - copy first valid quote to from valid date to all earlier positions
    #for ii in range(x.shape[0]):
    for ii, isymbolupdate in enumerate(symbolListupdate):
        xupdate = dataframeFromInternet[isymbolupdate]
        '''
        if isymbolupdate == 'SBUX':
            import pdb
            pdb.set_trace()
        '''
        print " isymbolupdate,xupdate = ", isymbolupdate, xupdate.as_matrix()
        xupdate = cleanspikes(xupdate)
        xupdate = interpolate(xupdate)
        xupdate = cleantobeginning(xupdate)
        #xupdate[ii,:] = np.array(xupdate[ii,:]).astype('float')
        #xupdate[ii,:] = interpolate(xupdate[ii,:])
        #xupdate[ii,:] = cleantobeginning(xupdate[ii,:])
    ###################

    x_net = dataframeFromInternet.as_matrix()
    x_net = x_net.swapaxes(0, 1)
    date_net = dataframeFromInternet.index
    symbols_net = list(dataframeFromInternet.columns.values)

    # find joined symbols
    symbols_all = symbols_hdf + symbols_net
    symbols_all = list(set(symbols_all))
    symbols_all.sort()

    for isymbol in symbols_all:
        # find date range for shorter of quotes update from net or on hdf
        try:
            hdf_index = symbols_hdf.index(isymbol)
            firstindexup_hdf = np.argmax(
                np.clip(x_hdf[hdf_index, :] / x_hdf[hdf_index, 0], 1.,
                        1. + 1.e-5))
            firstindexdown_hdf = np.argmin(
                np.clip(x_hdf[hdf_index, :] / x_hdf[hdf_index, 0], 1. - 1.e-5,
                        1.))
            firstindex_hdf = max(firstindexup_hdf, firstindexdown_hdf)

            net_index = symbols_net.index(isymbol)
            firstindexup_net = np.argmax(
                np.clip(x_net[net_index, :] / x_net[net_index, 0], 1.,
                        1. + 1.e-5))
            firstindexdown_net = np.argmin(
                np.clip(x_net[net_index, :] / x_net[net_index, 0], 1. - 1.e-5,
                        1.))
            firstindex_net = max(firstindexup_net, firstindexdown_net)
            firstDate = max(date_net[firstindex_net], date_hdf[firstindex_hdf])
            lastDate = min(date_net[-1], date_hdf[-1])

            values_hdf = x_hdf[hdf_index,
                               list(date_hdf).index(firstDate):list(date_hdf).
                               index(lastDate) + 1]
            values_net = x_net[net_index,
                               list(date_net).index(firstDate):list(date_net).
                               index(lastDate) + 1]

            if False in values_hdf == values_net:
                print " ... **** symbol ", format(
                    isymbol, '5s'
                ), " is different in hdf and update from internet (", firstDate, " to ", lastDate, " )"
            else:
                print " ... symbol ", format(
                    isymbol, '5s'
                ), " is same in hdf and update from internet (", firstDate, " to ", lastDate, " )"
        except:
            print " ... **** **** symbol ", format(
                isymbol, '5s'), " not matched in hdf and update from internet"
        '''
        if isymbol == 'SBUX':
            print " .... firstdate, lastdate = ", firstDate, lastDate
            datesForPlot = date_hdf[list(date_hdf).index(firstDate):list(date_hdf).index(lastDate)+1]
            _datesForPlot1=[]
            for i in range(len(datesForPlot)):
                datestr = datesForPlot[i]
                date_newformat = datetime.date(*[int(val) for val in datestr.split('-')])
                #date_newformat = datestr
                _datesForPlot1.append(date_newformat)
                iindex = list(date_hdf).index(firstDate) + i
                #print "i,date_newformat,values_net['SBUX'] = ", i,date_newformat,x_net[net_index,iindex]

            print " .... _datesForPlot1 = ", _datesForPlot1
            plt.figure()
            plt.grid()
            plt.plot(_datesForPlot1,values_hdf)
            plt.plot(_datesForPlot1,values_hdf,'b.')

            datesForPlot = date_net[list(date_net).index(firstDate):list(date_net).index(lastDate)+1]
            _datesForPlot2=[]
            for i in range(len(datesForPlot)):
                datestr = datesForPlot[i]
                date_newformat = datetime.date(*[int(val) for val in datestr.split('-')])
                #date_newformat = datestr
                _datesForPlot2.append(date_newformat)
            print "\n\n\n .... _datesForPlot2 = ", _datesForPlot2
            plt.plot(_datesForPlot2,values_net)
            plt.plot(_datesForPlot2,values_net,'g.')
        '''

    return
Beispiel #4
0
def UpdateHDF_yf(symbol_directory, symbols_file):
    """Update the HDF5 quote archive using the yahoo-fix quote source.

    Steps performed:
      1. Load the archived quotes for the list named by ``symbols_file``.
      2. If the symbols list contains symbols that are not columns of the
         archive, write them to ``newsymbols_tempfile.txt`` and request their
         quotes from 1991-01-01 through today.
      3. Request quotes for the full list from the last archived date through
         tomorrow and lay them over the archive (``combine_first``).
      4. Clean every column (cleanspikes, cleantobeginning, cleantoend,
         interpolate, cleantobeginning).
      5. Add the new-symbol columns, rebuild the synthetic 'CASH' column and
         write the table to ``<cwd>/symbols/<listname>_.hdf5``.

    Parameters:
        symbol_directory: folder holding the symbols file; the temporary
            new-symbols file is written here as well.
        symbols_file: name of the symbols list file inside that folder.

    Returns:
        None; the result is written to disk.

    Names expected from module scope (not defined in this function): os, np,
    pd, timedelta, loadQuotes_fromHDF, getLastDateFromHDF5, readSymbolList,
    get_quotes_yf, cleantoend.
    """
    import datetime
    from time import sleep

    print("  ... inside UpdateHDF_yf ...")

    filename = os.path.join(symbol_directory, symbols_file)

    # Only the archived DataFrame and the list name are needed below.
    _, _, _, quote, listname = loadQuotes_fromHDF(filename)
    print("  ... inside UpdateHDF_yf ... finished loadQuotes_fromHDF")

    def _return_quotes_array(symbolsFile,
                             start_date="2018-01-01",
                             end_date=None):
        """Fetch quotes for the symbols in ``symbolsFile``.

        Returns a tuple (values, symbol list, dates) where values is
        ``data.values`` of the frame returned by get_quotes_yf and dates is
        a numpy array of date strings (text before the first space of each
        index entry). ``end_date`` defaults to today's date.
        """
        import datetime
        from functions.readSymbols import readSymbolList
        from pandas_datareader import data as pdr
        import functions.fix_yahoo_finance as yf
        yf.pdr_override()  # <== that's all it takes :-)

        # read symbols list
        symbols = readSymbolList(symbolsFile, verbose=True)

        if end_date is None:
            end_date = str(datetime.date.today())

        data = get_quotes_yf(symbols, start_date=start_date, end_date=end_date)
        try:
            # for multiple symbols
            symbolList = list(data.columns)
        except Exception:
            # for single symbol
            symbolList = symbols
        newdates = np.array([str(d).split(' ')[0] for d in data.index])

        return data.values, symbolList, newdates

    # get last date in hdf5 archive
    lastdate = getLastDateFromHDF5(symbol_directory, symbols_file)
    print(" ... inside UpdateHDF5 ... lastdate = ", lastdate)
    sleep(3)

    # locate symbols added to list that aren't in HDF5 file
    symbols_in_list = readSymbolList(filename, verbose=False)
    symbols_in_HDF5 = list(quote.columns.values)
    new_symbols = [s for s in symbols_in_list if s not in symbols_in_HDF5]

    if len(new_symbols) > 0:
        # write new symbols to temporary file; the context manager closes it
        # even if a write fails, and before it is read back below.
        tempfilename = os.path.join(symbol_directory,
                                    "newsymbols_tempfile.txt")
        with open(tempfilename, "w") as OUTFILE:
            for isymbol in new_symbols:
                print("new symbol = ", isymbol)
                OUTFILE.write(str(isymbol) + "\n")

        newquotesfirstdate = datetime.date(1991, 1, 1)
        newquoteslastdate = datetime.date.today()

        # print dates to be used
        print("dates for new symbol found = ", newquotesfirstdate,
              newquoteslastdate)
        print("newquotesfirstdate, newquoteslastdate = ", newquotesfirstdate,
              newquoteslastdate)

        newadjClose, newsymbols, newdatearray = _return_quotes_array(
            tempfilename,
            start_date=newquotesfirstdate,
            end_date=newquoteslastdate)

        if isinstance(newdatearray, list):
            newdatearray = np.array(newdatearray)
        print(" newadjClose.shape = ", newadjClose.shape)
        print(" len(newsymbols) = ", len(newsymbols))
        print(" len(newdatearray) = ", len(newdatearray))
        print(" security values check: ",
              newadjClose[np.isnan(newadjClose)].shape)

        newdates = [str(d) for d in newdatearray]
        print("newadjClose.shape = ", newadjClose.shape)
        print('newsymbols = ', newsymbols)
        print('newdatearray = ', newdatearray)
        # Transpose only when the second axis is the one that matches the
        # number of dates; otherwise rows are already dates.
        if newadjClose.shape[1] == len(newdates):
            quotes_NewSymbols = pd.DataFrame(newadjClose.swapaxes(0, 1),
                                             index=newdates,
                                             columns=newsymbols)
        else:
            quotes_NewSymbols = pd.DataFrame(newadjClose,
                                             index=newdates,
                                             columns=newsymbols)

    # Incremental update: from the last archived date through tomorrow.
    if isinstance(lastdate, str):
        newquotesfirstdate = datetime.date(
            *[int(val) for val in lastdate.split('-')])
    else:
        newquotesfirstdate = lastdate
    newquoteslastdate = datetime.datetime.now() + timedelta(days=1)

    newadjClose, symbols, newdatearray = _return_quotes_array(
        filename, start_date=newquotesfirstdate, end_date=newquoteslastdate)

    print(" ...inside UpdateSymbols_inHDF5... newadjClose.shape =  ",
          newadjClose.shape)
    print(" ...inside UpdateSymbols_inHDF5... len(symbols) =  ", len(symbols))
    print(" ...inside UpdateSymbols_inHDF5...    quote.shape =  ", quote.shape)

    newdates = [str(d) for d in newdatearray]
    quoteupdate = pd.DataFrame(newadjClose, index=newdates, columns=symbols)

    # Freshly fetched values take precedence; the archive fills the rest.
    updatedquotes = quoteupdate.combine_first(quote)

    ###################
    from functions.TAfunctions import cleanspikes
    from functions.TAfunctions import interpolate
    from functions.TAfunctions import cleantobeginning

    # Clean up input quotes
    #  - infill interior NaN values using nearest good values to linearly interpolate
    #  - copy first valid quote to from valid date to all earlier positions
    for ii, isymbolupdate in enumerate(list(updatedquotes.columns.values)):
        xupdate = updatedquotes[isymbolupdate].values
        # NOTE(review): the label says '# nans' but the value printed is the
        # shape of the non-NaN entries.
        print("  ... progress:  ii, symbol, # nans = ", ii, isymbolupdate,
              xupdate[~np.isnan(xupdate)].shape)
        xupdate = cleanspikes(xupdate)
        xupdate = cleantobeginning(xupdate)
        xupdate = cleantoend(xupdate)
        xupdate = interpolate(xupdate, verbose=True)
        xupdate = cleantobeginning(xupdate)
        updatedquotes[isymbolupdate] = xupdate
    ###################

    if len(new_symbols) > 0:
        # DataFrame.info() prints its own report and returns None, so each of
        # these lines emits the report followed by the label and None.
        print("\n\n\n...quotes_NewSymbols = ", quotes_NewSymbols.info())
        print("\n\n\n...updatedquotes = ", updatedquotes.info())
        # NOTE(review): the new-symbol columns are added after the cleaning
        # loop, so they are stored uncleaned -- confirm this is intended.
        for isymbol in new_symbols:
            updatedquotes[isymbol] = quotes_NewSymbols[isymbol]
        print("\n\n\n...merged updatedquotes = ", updatedquotes.info())

    # Synthetic CASH series built at a 100000. base, gaining .01 on every
    # tenth row, then scaled back by 100000. At i == 0 the index i - 1 wraps
    # to the last element (still 100000.), so the first row gains .01 too.
    CASHadjClose = np.ones((len(updatedquotes.index)), float) * 100000.
    for i in range(CASHadjClose.shape[0]):
        if i % 10 == 0:
            CASHadjClose[i] = CASHadjClose[i - 1] + .01
        else:
            CASHadjClose[i] = CASHadjClose[i - 1]

    updatedquotes['CASH'] = CASHadjClose / 100000.

    # set up to write quotes to disk.
    dirname = os.path.join(os.getcwd(), "symbols")

    hdf5filename = os.path.join(dirname, listname + "_.hdf5")
    print("hdf5 filename = ", hdf5filename)
    updatedquotes.to_hdf(hdf5filename,
                         listname,
                         mode='a',
                         format='table',
                         append=False,
                         complevel=5,
                         complib='blosc')
def cleanup_quotes(symbols_file,newquotesfirstdate, newquoteslastdate):
    # compare quotes currently on hdf with updated quotes from internet.
    print " ...   inside compareHDF_and_newquotes   ..."
    print " ... newquotesfirstdate = ", newquotesfirstdate
    print " ... newquoteslastdate = ", newquoteslastdate

    # get existing quotes from hdf
    (directory_name, file_name) = os.path.split(symbols_file)
    (shortname, extension) = os.path.splitext( file_name )

    print "file name for symbols = ","_"+shortname+"_"
    print "file type for symbols = ",extension

    # set up to write quotes to disk.

    if shortname == "symbols" :
        listname = "TAA-Symbols"
    elif shortname == "cmg_symbols" :
        listname = "CMG-Symbols"
    elif shortname == "Naz100_Symbols" :
        listname = "Naz100_Symbols"
    elif shortname == "biglist" :
        listname = "biglist-Symbols"
    elif shortname == "ETF_symbols" :
        listname = "ETF-Symbols"
    elif shortname == "ProvidentFundSymbols" :
        listname = "ProvidentFund-Symbols"
    elif shortname == "sp500_symbols" :
        listname = "SP500-Symbols"
    else :
        listname = shortname

    hdf5_directory = os.path.join( os.getcwd(), "symbols" )
    hdf5filename = os.path.join(hdf5_directory, listname + "_.hdf5")

    print ""
    print ""
    print "symbol_directory = ", directory_name
    print "symbols_file = ", symbols_file
    print "shortname, extension = ",shortname, extension
    print "hdf5filename = ",hdf5filename

    dataframeFromHDF = pd.read_hdf( hdf5filename, listname )
    x_hdf = dataframeFromHDF.as_matrix()
    x_hdf = x_hdf.swapaxes(0,1)
    date_hdf = dataframeFromHDF.index
    symbols_hdf = list(dataframeFromHDF.columns.values)

    # Clean up input quotes
    #  - infill interior NaN values using nearest good values to linearly interpolate
    #  - copy first valid quote to from valid date to all earlier positions
    #for ii in range(x.shape[0]):
    for ii,isymbolupdate in enumerate(symbols_hdf):
        xupdate = dataframeFromHDF[isymbolupdate]
        '''
        if isymbolupdate == 'SBUX':
            import pdb
            pdb.set_trace()
        '''
        print " ... cleanup_quotes ... symbol = ", isymbolupdate
        xupdate = cleanspikes(xupdate)
        xupdate = interpolate(xupdate)
        xupdate = cleantobeginning(xupdate)
        dataframeFromHDF[isymbolupdate] = xupdate.copy()
        #xupdate[ii,:] = np.array(xupdate[ii,:]).astype('float')
        #xupdate[ii,:] = interpolate(xupdate[ii,:])
        #xupdate[ii,:] = cleantobeginning(xupdate[ii,:])

    dataframeFromHDF.to_hdf( hdf5filename, listname, mode='a',format='table',append=False,complevel=5,complib='blosc')

    return
def UpdateHDF5( symbol_directory, symbols_file ):

    ##
    ##  Update symbols in 'symbols_file' with quotes more recent than last update.
    ##

    filename = os.path.join(symbol_directory, symbols_file)

    x, symbols, datearray, quote, listname = loadQuotes_fromHDF( filename )

    # get last date in hdf5 archive
    #from datetime import datetime
    import datetime

    date = quote.index
    lastdate = getLastDateFromHDF5( symbol_directory, symbols_file )

    ##
    ## Get quotes for each symbol in list
    ## process dates.
    ## Clean up quotes.
    ## Make a plot showing all symbols in list
    ##

    # locate symbols added to list that aren't in HDF5 file
    symbols_in_list = readSymbolList( filename, verbose=False)
    symbols_in_HDF5 = list(quote.columns.values)
    new_symbols = [x for x in symbols_in_list if x  not in symbols_in_HDF5]

    # write new symbols to temporary file
    if len(new_symbols) > 0:
        # write new symbols to temporary file
        tempfilename = os.path.join(symbol_directory, "newsymbols_tempfile.txt")
        OUTFILE = open(tempfilename,"w",0)
        for i,isymbol in enumerate(new_symbols):
            print "new symbol = ", isymbol
            OUTFILE.write(str(isymbol) + "\n")

        newquotesfirstdate = datetime.date(1991,1,1)
        newquoteslastdate = datetime.date.today()

        # print dates to be used
        print "dates for new symbol found = ", newquotesfirstdate, newquoteslastdate

        newadjClose, newsymbols, newdatearray = arrayFromQuotesForList(tempfilename, newquotesfirstdate, newquoteslastdate)

        print " security values check: ",newadjClose[isnan(newadjClose)].shape

        newdates = []
        for i in range(newdatearray.shape[0]):
            newdates.append(str(newdatearray[i]))
        #quotes_NewSymbols = pd.DataFrame(newadjClose, [symbols,newdates], dtype=float)
        quotes_NewSymbols = pd.DataFrame(newadjClose.swapaxes(0,1), index=newdates, columns=newsymbols)

    ##
    ## Get quotes for each symbol in list
    ## process dates.
    ## Clean up quotes.
    ## Make a plot showing all symbols in list
    ##

    if type(lastdate) == str:
        newquotesfirstdate = datetime.date(*[int(val) for val in lastdate.split('-')])
    else:
        newquotesfirstdate = lastdate
    today = datetime.datetime.now()
    tomorrow = today + timedelta( days=1 )
    newquoteslastdate = tomorrow


    newadjClose, symbols, newdatearray = arrayFromQuotesForList(filename, newquotesfirstdate, newquoteslastdate)

    print " security values check: ",newadjClose[isnan(newadjClose)].shape

    newdates = []
    for i in range(newdatearray.shape[0]):
        newdates.append(str(newdatearray[i]))
    quoteupdate = pd.DataFrame( newadjClose.swapaxes(0,1), index=newdates, columns=symbols)

    updatedquotes = quoteupdate.combine_first( quote )

    ###################
    from functions.TAfunctions import cleanspikes
    from functions.TAfunctions import interpolate
    from functions.TAfunctions import cleantobeginning

    # clean up quotes for missing values and varying starting date
    #x = quote.as_matrix().swapaxes(0,1)
    xupdate = updatedquotes.values.T
    symbolListupdate = list(updatedquotes.columns.values)

    # Clean up input quotes
    #  - infill interior NaN values using nearest good values to linearly interpolate
    #  - copy first valid quote to from valid date to all earlier positions
    #for ii in range(x.shape[0]):
    for ii,isymbolupdate in enumerate(symbolListupdate):
        '''
        if ii%5 == 0:
            print "  ... progress:  ii, symbol = ", ii, isymbolupdate
        '''
        xupdate = updatedquotes[isymbolupdate].values
        xupdate = cleanspikes(xupdate)
        xupdate = interpolate(xupdate)
        xupdate = cleantobeginning(xupdate)
        updatedquotes[isymbolupdate] = xupdate
    ###################

    if len(new_symbols) > 0:
        print "\n\n\n...quotes_NewSymbols = ", quotes_NewSymbols.info()
        print "\n\n\n...updatedquotes = ", updatedquotes.info()
        for isymbol in new_symbols:
            updatedquotes[isymbol] = quotes_NewSymbols[isymbol]
        print "\n\n\n...merged updatedquotes = ", updatedquotes.info()


    CASHadjClose = np.ones( (len(updatedquotes.index)), float ) * 100.
    for i in range(CASHadjClose.shape[0]):
        if i%10 == 0:
            CASHadjClose[i] = CASHadjClose[i-1] + .01
        else:
            CASHadjClose[i] = CASHadjClose[i-1]

    updatedquotes['CASH'] = CASHadjClose


    # set up to write quotes to disk.
    dirname = os.path.join( os.getcwd(), "symbols" )

    hdf5filename = os.path.join( dirname, listname + "_.hdf5" )
    print "hdf5 filename = ",hdf5filename
    updatedquotes.to_hdf( hdf5filename, listname, mode='a',format='table',append=False,complevel=5,complib='blosc')

    return
def compareHDF_and_newquotes(symbols_file,newquotesfirstdate, newquoteslastdate):
    # compare quotes currently on hdf with updated quotes from internet.
    print " ...   inside compareHDF_and_newquotes   ..."
    print " ... newquotesfirstdate = ", newquotesfirstdate
    print " ... newquoteslastdate = ", newquoteslastdate

    # get existing quotes from hdf
    (directory_name, file_name) = os.path.split(symbols_file)
    (shortname, extension) = os.path.splitext( file_name )

    print "file name for symbols = ","_"+shortname+"_"
    print "file type for symbols = ",extension

    # set up to write quotes to disk.

    if shortname == "symbols" :
        listname = "TAA-Symbols"
    elif shortname == "cmg_symbols" :
        listname = "CMG-Symbols"
    elif shortname == "Naz100_Symbols" :
        listname = "Naz100_Symbols"
    elif shortname == "biglist" :
        listname = "biglist-Symbols"
    elif shortname == "ETF_symbols" :
        listname = "ETF-Symbols"
    elif shortname == "ProvidentFundSymbols" :
        listname = "ProvidentFund-Symbols"
    elif shortname == "sp500_symbols" :
        listname = "SP500-Symbols"
    else :
        listname = shortname

    hdf5_directory = os.path.join( os.getcwd(), "symbols" )
    hdf5filename = os.path.join(hdf5_directory, listname + "_.hdf5")

    print ""
    print ""
    print "symbol_directory = ", directory_name
    print "symbols_file = ", symbols_file
    print "shortname, extension = ",shortname, extension
    print "hdf5filename = ",hdf5filename

    dataframeFromHDF = pd.read_hdf( hdf5filename, listname )
    x_hdf = dataframeFromHDF.as_matrix()
    x_hdf = x_hdf.swapaxes(0,1)
    date_hdf = dataframeFromHDF.index
    symbols_hdf = list(dataframeFromHDF.columns.values)

    # get new quotes dataframe from internet
    newadjClose, newsymbols, newdatearray = arrayFromQuotesForList(symbols_file, newquotesfirstdate, newquoteslastdate)
    print " security values check: ",newadjClose[isnan(newadjClose)].shape
    newdates = []
    for i in range(newdatearray.shape[0]):
        newdates.append(str(newdatearray[i]))
    #quotes_NewSymbols = pd.DataFrame(newadjClose, [symbols,newdates], dtype=float)
    dataframeFromInternet = pd.DataFrame(newadjClose.swapaxes(0,1), index=newdates, columns=newsymbols)

    ###################
    from functions.TAfunctions import interpolate
    from functions.TAfunctions import cleantobeginning

    # clean up quotes for missing values and varying starting date
    #x = quote.as_matrix().swapaxes(0,1)
    ##xupdate = dataframeFromInternet.values.T
    symbolListupdate = list(dataframeFromInternet.columns.values)

    # Clean up input quotes
    #  - infill interior NaN values using nearest good values to linearly interpolate
    #  - copy first valid quote to from valid date to all earlier positions
    #for ii in range(x.shape[0]):
    for ii,isymbolupdate in enumerate(symbolListupdate):
        xupdate = dataframeFromInternet[isymbolupdate]
        '''
        if isymbolupdate == 'SBUX':
            import pdb
            pdb.set_trace()
        '''
        print " isymbolupdate,xupdate = ", isymbolupdate,xupdate.as_matrix()
        xupdate = cleanspikes(xupdate)
        xupdate = interpolate(xupdate)
        xupdate = cleantobeginning(xupdate)
        #xupdate[ii,:] = np.array(xupdate[ii,:]).astype('float')
        #xupdate[ii,:] = interpolate(xupdate[ii,:])
        #xupdate[ii,:] = cleantobeginning(xupdate[ii,:])
    ###################

    x_net = dataframeFromInternet.as_matrix()
    x_net = x_net.swapaxes(0,1)
    date_net = dataframeFromInternet.index
    symbols_net = list(dataframeFromInternet.columns.values)

    # find joined symbols
    symbols_all = symbols_hdf + symbols_net
    symbols_all = list(set(symbols_all))
    symbols_all.sort()

    for isymbol in symbols_all:
        # find date range for shorter of quotes update from net or on hdf
        try:
            hdf_index = symbols_hdf.index(isymbol)
            firstindexup_hdf = np.argmax(np.clip(x_hdf[hdf_index,:]/x_hdf[hdf_index,0],1.,1.+1.e-5))
            firstindexdown_hdf = np.argmin(np.clip(x_hdf[hdf_index,:]/x_hdf[hdf_index,0],1.-1.e-5,1.))
            firstindex_hdf = max(firstindexup_hdf,firstindexdown_hdf)

            net_index = symbols_net.index(isymbol)
            firstindexup_net = np.argmax(np.clip(x_net[net_index,:]/x_net[net_index,0],1.,1.+1.e-5))
            firstindexdown_net = np.argmin(np.clip(x_net[net_index,:]/x_net[net_index,0],1.-1.e-5,1.))
            firstindex_net = max(firstindexup_net,firstindexdown_net)
            firstDate = max( date_net[firstindex_net], date_hdf[firstindex_hdf] )
            lastDate = min( date_net[-1], date_hdf[-1] )

            values_hdf = x_hdf[hdf_index,list(date_hdf).index(firstDate):list(date_hdf).index(lastDate)+1]
            values_net = x_net[net_index,list(date_net).index(firstDate):list(date_net).index(lastDate)+1]

            if False in values_hdf==values_net:
                print " ... **** symbol ", format(isymbol,'5s'), " is different in hdf and update from internet (", firstDate, " to ", lastDate, " )"
            else:
                print " ... symbol ", format(isymbol,'5s'), " is same in hdf and update from internet (", firstDate, " to ", lastDate, " )"
        except:
            print " ... **** **** symbol ", format(isymbol,'5s'), " not matched in hdf and update from internet"

        '''
        if isymbol == 'SBUX':
            print " .... firstdate, lastdate = ", firstDate, lastDate
            datesForPlot = date_hdf[list(date_hdf).index(firstDate):list(date_hdf).index(lastDate)+1]
            _datesForPlot1=[]
            for i in range(len(datesForPlot)):
                datestr = datesForPlot[i]
                date_newformat = datetime.date(*[int(val) for val in datestr.split('-')])
                #date_newformat = datestr
                _datesForPlot1.append(date_newformat)
                iindex = list(date_hdf).index(firstDate) + i
                #print "i,date_newformat,values_net['SBUX'] = ", i,date_newformat,x_net[net_index,iindex]

            print " .... _datesForPlot1 = ", _datesForPlot1
            plt.figure()
            plt.grid()
            plt.plot(_datesForPlot1,values_hdf)
            plt.plot(_datesForPlot1,values_hdf,'b.')

            datesForPlot = date_net[list(date_net).index(firstDate):list(date_net).index(lastDate)+1]
            _datesForPlot2=[]
            for i in range(len(datesForPlot)):
                datestr = datesForPlot[i]
                date_newformat = datetime.date(*[int(val) for val in datestr.split('-')])
                #date_newformat = datestr
                _datesForPlot2.append(date_newformat)
            print "\n\n\n .... _datesForPlot2 = ", _datesForPlot2
            plt.plot(_datesForPlot2,values_net)
            plt.plot(_datesForPlot2,values_net,'g.')
        '''

    return