Example #1
0
def get_correlation_monthly_returns(path, df_prod_bf, prod, product_type,
                                    output_path):
    # Adjust for splits and create backward dividend adjusted file
    if product_type == 'index':
        columns = [
            'date', 'open', 'high', 'low', 'close', 'volume', 'dividend'
        ]
        prod_file = path + prod[0] + '/' + prod + '.csv'
        df_prod = pd.read_csv(prod_file,
                              names=columns,
                              parse_dates=['date'],
                              date_parser=parse)
        returns = compute_daily_log_returns(df_prod['close'].values)
    else:
        adjust_for_splits(path, [prod], product_type, output_path)
        backward_adjust_dividends(output_path, [prod], product_type)
        # Read data from CSVs to dataframe
        prod_file = output_path + prod + '_backward_dividend_adjusted.csv'
        df_prod = pd.read_csv(prod_file,
                              parse_dates=['date'],
                              date_parser=parse)
        returns = compute_daily_log_returns(
            df_prod['backward_adjusted_close'].values)

    returns_bf = compute_daily_log_returns(
        df_prod_bf['backward_adjusted_close'].values)
    labels_monthly_returns_prod_bf = convert_daily_returns_to_yyyymm_monthly_returns_pair(
        df_prod_bf['date'], returns_bf)
    labels_monthly_returns_prod = convert_daily_returns_to_yyyymm_monthly_returns_pair(
        df_prod['date'], returns)

    return compute_correlation(labels_monthly_returns_prod_bf,
                               labels_monthly_returns_prod)
def auto_backfill(path, output_path, prod_backfill, plot_option):
    """
    Methodology for automatic backfilling
    1) Look in folder of assets(funds or indices) whose prices are available till 1995.
    2) Find asset in this folder with maximum correlation in monthly returns.
    3) Use beta-adjusted values of the chosen fund or index to backfill.
    4) If maximum correlation is below a certain threshold don't backfill. This asset will be advised to be backfilled manually.
    """
    if os.path.isfile(output_path+prod_backfill+'_backfilled'+'.csv'):
        print "%s already backfilled"%prod_backfill
        return True
    prod_backfill_file = path+prod_backfill[0]+'/'+prod_backfill+'.csv'
    adjust_for_splits(path, [prod_backfill], 'etf', output_path)
    backward_adjust_dividends(output_path, [prod_backfill], 'etf')
    #df_prod_bf = pd.read_csv(path+prod_backfill+'_split_adjusted.csv')
    df_prod_bf = pd.read_csv(output_path+prod_backfill+'_backward_dividend_adjusted.csv', parse_dates=['date'], date_parser=parse)

    products = ['VTSMX', 'GMHBX', 'VBMFX', 'VEIEX', '^MXEA', '@DJCI', 'VWAHX', 'VFISX', 'VIPSX', 'VSIIX', 'CVK', 'DFSVX', \
                '^XMSC', 'MARFX', 'XMSW', 'VIVAX', 'VGTSX', 'VGSIX', 'VEIEX', 'RUI', 'PEBIX', 'SPX', '@GU6']
    products_type = ['fund', 'fund', 'fund', 'fund', 'index', 'index', 'fund', 'fund', 'fund', 'fund', 'index', 'fund', \
                 'index', 'fund', 'index', 'fund', 'fund', 'fund', 'fund', 'index', 'fund', 'index', 'index']
    correlations = []
    for i in xrange(len(products)):
        correlations.append(get_correlation_monthly_returns(path, df_prod_bf, products[i], products_type[i], output_path))
    
    #print zip(products, correlations)
    if max(correlations) < 0.7:
        print ('Product %s not highly correlated with any of our proxy products. Please backfill manually.'%prod_backfill)
        return False

    backfill_choice = correlations.index(max(correlations))
    print "%s being backfilled with %s. Correlation between them is: %0.3f" % (prod_backfill, products[backfill_choice], max(correlations)) ,
    df_prod_bf, starting_date = backfill_each_product(path, df_prod_bf, products[backfill_choice], products_type[backfill_choice], beta_adjust=True, output_path=output_path)
    print " from %s" % (starting_date)
    df_prod_bf['date'] = df_prod_bf['date'].apply(lambda x: x.strftime('%Y%m%d'))

    df_prod_bf.to_csv(output_path+prod_backfill+'_backfilled'+'.csv',index=False)
    
    if plot_option.lower() == 'y':
        df_prod_bf = pd.read_csv(output_path+prod_backfill+'_backfilled'+'.csv')
        ax = df_prod_bf.plot(y='close',use_index=False)
        starting_index = df_prod_bf[df_prod_bf['date']==int(starting_date)].index[0]
        ax.axvline(x=starting_index)
        plt.savefig(output_path+"plot_"+prod_backfill+".png")
    
    return True 
def get_correlation_monthly_returns(path, df_prod_bf, prod, product_type, output_path):
    # Adjust for splits and create backward dividend adjusted file
    if product_type == 'index':
        columns = ['date','open','high','low','close','volume','dividend']
        prod_file = path+prod[0]+'/'+prod+'.csv'
        df_prod = pd.read_csv(prod_file,names=columns, parse_dates=['date'], date_parser=parse)
        returns = compute_daily_log_returns(df_prod['close'].values)
    else:
        adjust_for_splits(path, [prod], product_type, output_path)
        backward_adjust_dividends(output_path, [prod], product_type)
        # Read data from CSVs to dataframe
        prod_file = output_path+prod+'_backward_dividend_adjusted.csv'
        df_prod = pd.read_csv(prod_file, parse_dates=['date'], date_parser=parse)
        returns = compute_daily_log_returns(df_prod['backward_adjusted_close'].values)
            
    returns_bf = compute_daily_log_returns(df_prod_bf['backward_adjusted_close'].values)
    labels_monthly_returns_prod_bf = convert_daily_returns_to_yyyymm_monthly_returns_pair(df_prod_bf['date'], returns_bf)
    labels_monthly_returns_prod = convert_daily_returns_to_yyyymm_monthly_returns_pair(df_prod['date'], returns)

    return compute_correlation(labels_monthly_returns_prod_bf, labels_monthly_returns_prod)
def backfill_each_product(path, df_prod_bf, prod, product_type, beta_adjust=False, output_path=None):
    """
    If the assets are A and B and price of A when unavailable is being backfilled using price of B
    A_close_price_backfill_i/A_close_price_backfill_(i+1) = B_backward_adjusted_close_i/ B_backward_adjusted_close_(i+1)
    Asset prices of A where available are used without modification.

    Args:
        df_prod_bf(dataframe): Dataframe of product being backfilled
        prod(string): Current products being used to backfill
        product_types(string): Type(etf/fund) of current product. Backward adjusted pricing will difer based on this.

    Returns:
        df_prod_bf(dataframe): backfilled data DataFrame
        starting_index(int): index at which backfilling was started
    """
    if output_path == None:
        output_path = path
    # Adjust for splits and create backward dividend adjusted file
    if not product_type == 'index':
        adjust_for_splits(path, [prod], product_type, output_path)
        backward_adjust_dividends(output_path, [prod], product_type)
        # Read data from CSVs to datarframe
        prod_file = output_path+prod+'_backward_dividend_adjusted.csv'
        df_prod = pd.read_csv(prod_file, parse_dates=['date'], date_parser=parse)
    else:
        columns = ['date','open','high','low','close','volume','dividend']
        prod_file = path+prod[0]+'/'+prod+'.csv'
        df_prod = pd.read_csv(prod_file,names=columns, parse_dates=['date'], date_parser=parse)

    if beta_adjust == True:
        returns_bf = df_prod_bf.close / df_prod_bf.close.shift(1)
        std_prod_bf = np.std(returns_bf.values[1:])
        returns = df_prod.close / df_prod.close.shift(1)
        std_prod = np.std(returns.values[1:])
        beta_factor = std_prod_bf/std_prod
    else:
        beta_factor = 1.0 

    # Get last date available in df_prod_bf
    # This is the starting date for backfill
    starting_date = df_prod_bf['date'].iloc[0]
    # Get list of all dates after which data is available in df_prod
    # Select the index of closest date to starting date as starting index
    starting_index = df_prod[df_prod['date']>=starting_date].index[0]

    #Print gap period in between them
    #print "Gap period:" + str(starting_date - df_prod['date'].iloc[starting_index])
    
    # Add last last available entry in dataframe to new dataframe 
    # This dataframe will store all backfilled  values
    df_backfilled = pd.DataFrame(data=df_prod_bf.head(1).reset_index(drop=True))
    # Keeps track index in new backfilled dataframe
    n = 1

    if not product_type == 'index':
        df_prod['backward_adjusted_close'] = beta_factor*df_prod['backward_adjusted_close']
    else:
        df_prod['close'] = beta_factor * df_prod['close']
    
    for i in xrange(starting_index-1,-1,-1):
        # backfill_ratio is kept to ensure daily log returns is same between both poducts when data is unavailable
        if not product_type == 'index':
            backfill_ratio = df_prod['backward_adjusted_close'].iloc[i]/df_prod['backward_adjusted_close'].iloc[i+1]
        else:
            backfill_ratio = df_prod['close'].iloc[i]/df_prod['close'].iloc[i+1]
        df_backfilled.loc[n,'date'] = df_prod.loc[i,'date']
        # A_price_backfill_i/A_price_backfill_(i+1) = B_backward_adjusted_close_i/ B_backward_adjusted_close_(i+1)        
        df_backfilled.loc[n,'open'] = df_backfilled.loc[n-1,'open'] * backfill_ratio
        df_backfilled.loc[n,'high'] = df_backfilled.loc[n-1,'high'] * backfill_ratio
        df_backfilled.loc[n,'low'] = df_backfilled.loc[n-1,'low'] * backfill_ratio
        df_backfilled.loc[n,'close'] = df_backfilled.loc[n-1,'close'] * backfill_ratio
        n += 1
    
    df_backfilled['open'] = df_backfilled['open']
    df_backfilled['low'] = df_backfilled['low']
    df_backfilled['high'] =  df_backfilled['high']
    df_backfilled['close'] = df_backfilled['close']
    df_backfilled['volume'] = 0  # because what we might use may be a fund and have no volume
    df_backfilled['dividend'] = 0  # assume no dividends are being given out as backwad adjusted price used already 
    
    # Concat the backfilled dataframe and the available dataframe for prod_bf
    df_prod_bf = pd.concat([df_backfilled.iloc[::-1][:-1],df_prod_bf])

    return df_prod_bf, starting_date.to_datetime().strftime("%Y%m%d")
def dividend_adjust(path, prod):
    shutil.copy2(path+prod+'_backfilled.csv', path+prod+'_split_adjusted.csv')
    backward_adjust_dividends(path, [prod], ['etf'])
    forward_adjust_dividends(path, [prod], ['etf'], path+'complete/')
Example #6
0
def backfill_each_product(path,
                          df_prod_bf,
                          prod,
                          product_type,
                          beta_adjust=False,
                          output_path=None):
    """
    If the assets are A and B and price of A when unavailable is being backfilled using price of B
    A_close_price_backfill_i/A_close_price_backfill_(i+1) = B_backward_adjusted_close_i/ B_backward_adjusted_close_(i+1)
    Asset prices of A where available are used without modification.

    Args:
        df_prod_bf(dataframe): Dataframe of product being backfilled
        prod(string): Current products being used to backfill
        product_types(string): Type(etf/fund) of current product. Backward adjusted pricing will difer based on this.

    Returns:
        df_prod_bf(dataframe): backfilled data DataFrame
        starting_index(int): index at which backfilling was started
    """
    if output_path == None:
        output_path = path
    # Adjust for splits and create backward dividend adjusted file
    if not product_type == 'index':
        adjust_for_splits(path, [prod], product_type, output_path)
        backward_adjust_dividends(output_path, [prod], product_type)
        # Read data from CSVs to datarframe
        prod_file = output_path + prod + '_backward_dividend_adjusted.csv'
        df_prod = pd.read_csv(prod_file,
                              parse_dates=['date'],
                              date_parser=parse)
    else:
        columns = [
            'date', 'open', 'high', 'low', 'close', 'volume', 'dividend'
        ]
        prod_file = path + prod[0] + '/' + prod + '.csv'
        df_prod = pd.read_csv(prod_file,
                              names=columns,
                              parse_dates=['date'],
                              date_parser=parse)

    if beta_adjust == True:
        returns_bf = df_prod_bf.close / df_prod_bf.close.shift(1)
        std_prod_bf = np.std(returns_bf.values[1:])
        returns = df_prod.close / df_prod.close.shift(1)
        std_prod = np.std(returns.values[1:])
        beta_factor = std_prod_bf / std_prod
    else:
        beta_factor = 1.0

    # Get last date available in df_prod_bf
    # This is the starting date for backfill
    starting_date = df_prod_bf['date'].iloc[0]
    # Get list of all dates after which data is available in df_prod
    # Select the index of closest date to starting date as starting index
    starting_index = df_prod[df_prod['date'] >= starting_date].index[0]

    #Print gap period in between them
    #print "Gap period:" + str(starting_date - df_prod['date'].iloc[starting_index])

    # Add last last available entry in dataframe to new dataframe
    # This dataframe will store all backfilled  values
    df_backfilled = pd.DataFrame(data=df_prod_bf.head(1).reset_index(
        drop=True))
    # Keeps track index in new backfilled dataframe
    n = 1

    if not product_type == 'index':
        df_prod['backward_adjusted_close'] = beta_factor * df_prod[
            'backward_adjusted_close']
    else:
        df_prod['close'] = beta_factor * df_prod['close']

    for i in xrange(starting_index - 1, -1, -1):
        # backfill_ratio is kept to ensure daily log returns is same between both poducts when data is unavailable
        if not product_type == 'index':
            backfill_ratio = df_prod['backward_adjusted_close'].iloc[
                i] / df_prod['backward_adjusted_close'].iloc[i + 1]
        else:
            backfill_ratio = df_prod['close'].iloc[i] / df_prod['close'].iloc[
                i + 1]
        df_backfilled.loc[n, 'date'] = df_prod.loc[i, 'date']
        # A_price_backfill_i/A_price_backfill_(i+1) = B_backward_adjusted_close_i/ B_backward_adjusted_close_(i+1)
        df_backfilled.loc[n,
                          'open'] = df_backfilled.loc[n - 1,
                                                      'open'] * backfill_ratio
        df_backfilled.loc[n,
                          'high'] = df_backfilled.loc[n - 1,
                                                      'high'] * backfill_ratio
        df_backfilled.loc[n, 'low'] = df_backfilled.loc[n - 1,
                                                        'low'] * backfill_ratio
        df_backfilled.loc[
            n, 'close'] = df_backfilled.loc[n - 1, 'close'] * backfill_ratio
        n += 1

    df_backfilled['open'] = df_backfilled['open']
    df_backfilled['low'] = df_backfilled['low']
    df_backfilled['high'] = df_backfilled['high']
    df_backfilled['close'] = df_backfilled['close']
    df_backfilled[
        'volume'] = 0  # because what we might use may be a fund and have no volume
    df_backfilled[
        'dividend'] = 0  # assume no dividends are being given out as backwad adjusted price used already

    # Concat the backfilled dataframe and the available dataframe for prod_bf
    df_prod_bf = pd.concat([df_backfilled.iloc[::-1][:-1], df_prod_bf])

    return df_prod_bf, starting_date.to_datetime().strftime("%Y%m%d")
Example #7
0
def dividend_adjust(path, prod):
    shutil.copy2(path + prod + '_backfilled.csv',
                 path + prod + '_split_adjusted.csv')
    backward_adjust_dividends(path, [prod], ['etf'])
    forward_adjust_dividends(path, [prod], ['etf'], path + 'complete/')
Example #8
0
def auto_backfill(path, output_path, prod_backfill, plot_option):
    """
    Methodology for automatic backfilling
    1) Look in folder of assets(funds or indices) whose prices are available till 1995.
    2) Find asset in this folder with maximum correlation in monthly returns.
    3) Use beta-adjusted values of the chosen fund or index to backfill.
    4) If maximum correlation is below a certain threshold don't backfill. This asset will be advised to be backfilled manually.
    """
    if os.path.isfile(output_path + prod_backfill + '_backfilled' + '.csv'):
        print "%s already backfilled" % prod_backfill
        return True
    prod_backfill_file = path + prod_backfill[0] + '/' + prod_backfill + '.csv'
    adjust_for_splits(path, [prod_backfill], 'etf', output_path)
    backward_adjust_dividends(output_path, [prod_backfill], 'etf')
    #df_prod_bf = pd.read_csv(path+prod_backfill+'_split_adjusted.csv')
    df_prod_bf = pd.read_csv(output_path + prod_backfill +
                             '_backward_dividend_adjusted.csv',
                             parse_dates=['date'],
                             date_parser=parse)

    products = ['VTSMX', 'GMHBX', 'VBMFX', 'VEIEX', '^MXEA', '@DJCI', 'VWAHX', 'VFISX', 'VIPSX', 'VSIIX', 'CVK', 'DFSVX', \
                '^XMSC', 'MARFX', 'XMSW', 'VIVAX', 'VGTSX', 'VGSIX', 'VEIEX', 'RUI', 'PEBIX', 'SPX', '@GU6']
    products_type = ['fund', 'fund', 'fund', 'fund', 'index', 'index', 'fund', 'fund', 'fund', 'fund', 'index', 'fund', \
                 'index', 'fund', 'index', 'fund', 'fund', 'fund', 'fund', 'index', 'fund', 'index', 'index']
    correlations = []
    for i in xrange(len(products)):
        correlations.append(
            get_correlation_monthly_returns(path, df_prod_bf, products[i],
                                            products_type[i], output_path))

    #print zip(products, correlations)
    if max(correlations) < 0.7:
        print(
            'Product %s not highly correlated with any of our proxy products. Please backfill manually.'
            % prod_backfill)
        return False

    backfill_choice = correlations.index(max(correlations))
    print "%s being backfilled with %s. Correlation between them is: %0.3f" % (
        prod_backfill, products[backfill_choice], max(correlations)),
    df_prod_bf, starting_date = backfill_each_product(
        path,
        df_prod_bf,
        products[backfill_choice],
        products_type[backfill_choice],
        beta_adjust=True,
        output_path=output_path)
    print " from %s" % (starting_date)
    df_prod_bf['date'] = df_prod_bf['date'].apply(
        lambda x: x.strftime('%Y%m%d'))

    df_prod_bf.to_csv(output_path + prod_backfill + '_backfilled' + '.csv',
                      index=False)

    if plot_option.lower() == 'y':
        df_prod_bf = pd.read_csv(output_path + prod_backfill + '_backfilled' +
                                 '.csv')
        ax = df_prod_bf.plot(y='close', use_index=False)
        starting_index = df_prod_bf[df_prod_bf['date'] == int(
            starting_date)].index[0]
        ax.axvline(x=starting_index)
        plt.savefig(output_path + "plot_" + prod_backfill + ".png")

    return True