def __init__(self, parent, data, fit_period, optimiser, cleaning):
    """
    Estimate portfolio weights for a single fitting period.

    :param parent: owning object (kept for interface compatibility; unused here)
    :param data: pd.DataFrame of returns, indexed by date
    :param fit_period: object with period_start/period_end, fit_start/fit_end
        attributes and a no_data flag
    :param optimiser: object exposing .call(data, cleaning, must_haves)
        returning (weights, diagnostics)
    :param cleaning: bool; if True, replace missing weights using assets
        that have data in this period
    """
    if cleaning:
        # Generate 'must have' from the period we need, because if we're
        # bootstrapping the fitting data could come from completely
        # different periods
        current_period_data = data[
            fit_period.period_start:fit_period.period_end]
        must_haves = must_have_item(current_period_data)
    else:
        must_haves = None

    if fit_period.no_data:
        # No data to fit with: produce all-NaN weights
        diag = None
        # BUGFIX: size was taken from current_period_data, which is only
        # defined when cleaning=True (NameError otherwise); data has the
        # same number of columns, so use it directly.
        size = data.shape[1]
        weights = [np.nan] * size
        if cleaning:
            weights = clean_weights(weights, must_haves)
    else:
        # We have data: optimise over the fitting window
        subset_fitting_data = data[fit_period.fit_start:fit_period.fit_end]
        (weights, diag) = optimiser.call(
            subset_fitting_data, cleaning, must_haves)

    setattr(self, "diag", diag)
    setattr(self, "weights", weights)
def __init__(self, parent, data, fit_period, optimiser, cleaning):
    """
    Fit portfolio weights over one period.

    :param parent: owning object (unused here, kept for interface compatibility)
    :param data: pd.DataFrame of returns, indexed by date
    :param fit_period: object with period_start/period_end, fit_start/fit_end
        and a no_data flag
    :param optimiser: object exposing .call(data, cleaning, must_haves)
        returning (weights, diagnostics)
    :param cleaning: bool; if True, repair missing weights using assets with
        data in this period
    """
    if cleaning:
        # Generate 'must have' from the period we need, because if we're
        # bootstrapping the fitting window could be from completely
        # different periods
        current_period_data = data[
            fit_period.period_start:fit_period.period_end]
        must_haves = must_have_item(current_period_data)
    else:
        must_haves = None

    if fit_period.no_data:
        # No data to fit with
        diag = None
        # BUGFIX: original read current_period_data.shape[1], but
        # current_period_data only exists when cleaning=True; take the
        # column count from data (same columns) to avoid a NameError.
        size = data.shape[1]
        weights = [np.nan] * size
        if cleaning:
            weights = clean_weights(weights, must_haves)
    else:
        # We have data
        subset_fitting_data = data[fit_period.fit_start:fit_period.fit_end]
        (weights, diag) = optimiser.call(
            subset_fitting_data, cleaning, must_haves)

    setattr(self, "diag", diag)
    setattr(self, "weights", weights)
def __init__(self, data, frequency="W", date_method="expanding", rollyears=20,
             dict_group=None, boring_offdiag=0.99, cleaning=True, **kwargs):
    """
    Build a list of correlation matrices, one per fitting period.

    :param data: pd.DataFrame, or list of them if pooling
    :param frequency: downsampling frequency passed to resample ("W", "D", ...)
    :param date_method: method passed to generate_fitting_dates
    :param rollyears: window length in years if date_method is "rolling"
    :param dict_group: grouping dictionary used to replace missing values
        (default: empty dict)
    :param boring_offdiag: off-diagonal value for the 'boring' fallback matrix
    :param cleaning: bool (or str); replace bad values in each matrix
    :param kwargs: passed to correlation_single_period
    """
    cleaning = str2Bool(cleaning)

    # Avoid a mutable default argument; empty dict means no grouping.
    if dict_group is None:
        dict_group = dict()

    # Grouping dictionary, converted to a faster, algo-friendly form
    group_dict = group_dict_from_natural(dict_group)

    data = df_from_list(data)
    column_names = list(data.columns)

    # BUGFIX: resample(frequency, how="last") uses the 'how' argument that
    # was removed from pandas; use the method-chained aggregation instead.
    data = data.resample(frequency).last()

    # Generate time periods
    fit_dates = generate_fitting_dates(
        data, date_method=date_method, rollyears=rollyears)

    size = len(column_names)
    corr_with_no_data = boring_corr_matrix(size, offdiag=boring_offdiag)

    # Create a list of correlation matrices
    corr_list = []

    print(__file__ + ":" +
          str(inspect.getframeinfo(inspect.currentframe())[:3][1]) +
          ":" + "Correlation estimate")

    # Now for each time period, estimate correlation
    for fit_period in fit_dates:
        print(__file__ + ":" +
              str(inspect.getframeinfo(inspect.currentframe())[:3][1]) +
              ":" + "Estimating from %s to %s" %
              (fit_period.period_start, fit_period.period_end))

        if fit_period.no_data:
            # No data to fit with: all-NaN 'boring' matrix
            corrmat = boring_corr_matrix(size, offdiag=np.nan, diag=np.nan)
        else:
            data_for_estimate = data[fit_period.fit_start:fit_period.fit_end]
            corrmat = correlation_single_period(data_for_estimate, **kwargs)

        if cleaning:
            current_period_data = data[fit_period.fit_start:fit_period.fit_end]
            must_haves = must_have_item(current_period_data)
            # Means we can use earlier correlations with sensible values
            corrmat = clean_correlation(corrmat, corr_with_no_data, must_haves)

        corr_list.append(corrmat)

    setattr(self, "corr_list", corr_list)
    setattr(self, "columns", column_names)
    setattr(self, "fit_dates", fit_dates)
def get_must_have_dict_from_data(data: pd.DataFrame) -> dict:
    """
    Map each asset (column) name in *data* to its 'must have' flag.

    :param data: pd.DataFrame whose columns are asset names
    :return: dict of asset name -> flag from must_have_item(data)
    """
    must_have_flags = must_have_item(data)
    return {
        asset_name: flag
        for asset_name, flag in zip(list(data.columns), must_have_flags)
    }
def __init__(self, data, log=logtoscreen("optimiser"), frequency="W",
             date_method="expanding", rollyears=20, dict_group=None,
             boring_offdiag=0.99, cleaning=True, **kwargs):
    """
    Estimate a list of correlation matrices, one per fitting period.

    :param data: pd.DataFrame, or list of them if pooling
    :param log: logger (kept for interface compatibility; output goes to print)
    :param frequency: downsampling frequency passed to resample ("W", "D", ...)
    :param date_method: method passed to generate_fitting_dates
    :param rollyears: window length in years if date_method is "rolling"
    :param dict_group: grouping dictionary used to replace missing values
        (default: empty dict)
    :param boring_offdiag: off-diagonal value for the 'boring' fallback matrix
    :param cleaning: bool (or str); replace bad values in each matrix
    :param kwargs: passed to correlation_single_period
    """
    cleaning = str2Bool(cleaning)

    # Avoid a mutable default argument; empty dict means no grouping.
    if dict_group is None:
        dict_group = dict()

    # Grouping dictionary, convert to faster, algo friendly, form
    group_dict = group_dict_from_natural(dict_group)

    data = df_from_list(data)
    column_names = list(data.columns)

    # BUGFIX: resample(frequency, how="last") uses the 'how' argument that
    # was removed from pandas; use the method-chained aggregation instead.
    data = data.resample(frequency).last()

    # Generate time periods
    fit_dates = generate_fitting_dates(
        data, date_method=date_method, rollyears=rollyears)

    size = len(column_names)
    corr_with_no_data = boring_corr_matrix(size, offdiag=boring_offdiag)

    # Create a list of correlation matrices
    corr_list = []

    print(__file__ + ":" +
          str(inspect.getframeinfo(inspect.currentframe())[:3][1]) +
          ":" + "Correlation estimate")

    # Now for each time period, estimate correlation
    for fit_period in fit_dates:
        print(__file__ + ":" +
              str(inspect.getframeinfo(inspect.currentframe())[:3][1]) +
              ":" + "Estimating from %s to %s" %
              (fit_period.period_start, fit_period.period_end))

        if fit_period.no_data:
            # No data to fit with: all-NaN 'boring' matrix
            corrmat = boring_corr_matrix(size, offdiag=np.nan, diag=np.nan)
        else:
            data_for_estimate = data[fit_period.fit_start:fit_period.fit_end]
            corrmat = correlation_single_period(data_for_estimate, **kwargs)

        if cleaning:
            current_period_data = data[fit_period.fit_start:fit_period.fit_end]
            must_haves = must_have_item(current_period_data)
            # means we can use earlier correlations with sensible values
            corrmat = clean_correlation(corrmat, corr_with_no_data, must_haves)

        corr_list.append(corrmat)

    setattr(self, "corr_list", corr_list)
    setattr(self, "columns", column_names)
    setattr(self, "fit_dates", fit_dates)
def clean_corr_matrix_given_data(self, fit_period: fitDates,
                                 data_for_correlation: pd.DataFrame):
    """
    Clean this correlation matrix using the data for one fitting period.

    :param fit_period: period specification with fit_start/fit_end and no_data
    :param data_for_correlation: full pd.DataFrame the period is sliced from
    :return: self unchanged if the period has no data, otherwise the
        cleaned correlation object
    """
    # Nothing to clean against when the period holds no data
    if fit_period.no_data:
        return self

    period_slice = data_for_correlation[
        fit_period.fit_start:fit_period.fit_end]

    # must_haves are items with data in this period, so we need some
    # kind of correlation for them
    must_haves = must_have_item(period_slice)

    return self.clean_correlations(must_haves)
def calculate(self, fit_period):
    """
    Work out the correlation matrix for a single period.

    :param fit_period: specification of the period we're calculating the
        correlation for (fit_start/fit_end, no_data flag)
    :return: np.array of correlation matrix
    """
    # Pull configuration off self up front
    cleaning = self.cleaning
    corr_with_no_data = self.corr_with_no_data
    corr_for_cleaning = self.corr_for_cleaning
    data_as_df = self.data_as_df
    kwargs = self.kwargs
    ew_lookback_corrected = self.ew_lookback_corrected
    floor_at_zero = self.floor_at_zero

    if fit_period.no_data:
        # Nothing to estimate from: fall back to the 'no data' matrix
        corrmat = corr_with_no_data
    else:
        fit_window = data_as_df[fit_period.fit_start:fit_period.fit_end]
        corrmat = correlation_calculator(
            fit_window, ew_lookback=ew_lookback_corrected, **kwargs)

    if cleaning:
        period_window = data_as_df[fit_period.fit_start:fit_period.fit_end]
        # must_haves are items with data in this period, so we need some
        # kind of correlation; cleaning lets us reuse earlier correlations
        # with sensible values
        must_haves = must_have_item(period_window)
        corrmat = clean_correlation(corrmat, corr_for_cleaning, must_haves)

    # Can't floor earlier as the matrix might still contain NaNs
    if floor_at_zero:
        corrmat[corrmat < 0] = 0.0

    return corrmat
def calculate(self, fit_period):
    """
    Work out the correlation matrix for one fitting period.

    :param fit_period: specification of the period we're calculating the
        correlation for (fit_start/fit_end, no_data flag)
    :return: np.array of correlation matrix
    """
    # Unpack configuration from self before branching
    cleaning = self.cleaning
    corr_with_no_data = self.corr_with_no_data
    corr_for_cleaning = self.corr_for_cleaning
    data_as_df = self.data_as_df
    kwargs = self.kwargs
    ew_lookback_corrected = self.ew_lookback_corrected
    floor_at_zero = self.floor_at_zero

    if fit_period.no_data:
        # No data to fit with: use the fallback matrix
        corrmat = corr_with_no_data
    else:
        estimation_window = data_as_df[
            fit_period.fit_start:fit_period.fit_end]
        corrmat = correlation_calculator(
            estimation_window,
            ew_lookback=ew_lookback_corrected,
            **kwargs)

    if cleaning:
        cleaning_window = data_as_df[
            fit_period.fit_start:fit_period.fit_end]
        # must_haves are items with data in this period, so we need some
        # kind of correlation for them; cleaning means we can fall back to
        # earlier correlations with sensible values
        must_haves = must_have_item(cleaning_window)
        corrmat = clean_correlation(corrmat, corr_for_cleaning, must_haves)

    # Flooring must happen last: the matrix might contain NaNs before cleaning
    if floor_at_zero:
        corrmat[corrmat < 0] = 0.0

    return corrmat
def __init__(self, parent, data, fit_period, optimiser, cleaning):
    """
    Compute weights (and diagnostics) for a single fitting period.

    :param parent: owning object (unused here, kept for interface compatibility)
    :param data: pd.DataFrame of returns, indexed by date
    :param fit_period: object with period_start/period_end, fit_start/fit_end
        and a no_data flag
    :param optimiser: object exposing .call(data, cleaning, must_haves)
        returning (weights, diagnostics)
    :param cleaning: bool; if True, repair missing weights using assets with
        data in this period
    """
    if cleaning:
        # 'Must haves' come from the period itself, not the (possibly
        # bootstrapped) fitting window
        current_period_data = data[
            fit_period.period_start:fit_period.period_end]
        must_haves = must_have_item(current_period_data)
    else:
        must_haves = None

    if fit_period.no_data:
        diag = None
        # BUGFIX: size was read from current_period_data, which only exists
        # when cleaning=True; data has the same columns, so use it instead.
        size = data.shape[1]
        weights = [np.nan] * size
        if cleaning:
            weights = clean_weights(weights, must_haves)
    else:
        subset_fitting_data = data[fit_period.fit_start:fit_period.fit_end]
        (weights, diag) = optimiser.call(
            subset_fitting_data, cleaning, must_haves)

    setattr(self, "diag", diag)
    setattr(self, "weights", weights)
def __init__(self, data, log=logtoscreen("optimiser"), frequency="W",
             date_method="expanding", rollyears=20, dict_group=dict(),
             boring_offdiag=0.99, cleaning=True, **kwargs):
    """
    We generate a correlation from either a pd.DataFrame, or a list of
    them if we're pooling.

    It's important that forward filling, or index / ffill / diff has been
    done before we begin.

    :param data: Data to get correlations from
    :type data: pd.DataFrame or list if pooling

    :param frequency: Downsampling frequency. Must be "D", "W" or bigger
    :type frequency: str

    :param date_method: Method to pass to generate_fitting_dates
    :type date_method: str

    :param roll_years: If date_method is "rolling", number of years in window
    :type roll_years: int

    :param dict_group: dictionary of groupings; used to replace missing values
    :type dict_group: dict

    :param boring_offdiag: Value used in creating 'boring' matrix, for when
        no data
    :type boring_offdiag: float

    :param **kwargs: passed to correlation_single_period

    :returns: CorrelationList
    """
    cleaning = str2Bool(cleaning)

    # Grouping dictionary, converted to a faster, algo-friendly form
    group_dict = group_dict_from_natural(dict_group)

    pooled_data = df_from_list(data)
    column_names = list(pooled_data.columns)
    pooled_data = pooled_data.resample(frequency).last()

    # Generate time periods
    fit_dates = generate_fitting_dates(
        pooled_data, date_method=date_method, rollyears=rollyears)

    n_assets = len(column_names)
    corr_with_no_data = boring_corr_matrix(n_assets, offdiag=boring_offdiag)

    # Collect one correlation matrix per fitting period
    corr_list = []

    progress = progressBar(len(fit_dates), "Estimating correlations")

    for fit_period in fit_dates:
        progress.iterate()

        if fit_period.no_data:
            # No data to fit with: an all-NaN 'boring' matrix
            corrmat = boring_corr_matrix(
                n_assets, offdiag=np.nan, diag=np.nan)
        else:
            estimation_window = pooled_data[
                fit_period.fit_start:fit_period.fit_end]
            corrmat = correlation_single_period(estimation_window, **kwargs)

        if cleaning:
            cleaning_window = pooled_data[
                fit_period.fit_start:fit_period.fit_end]
            must_haves = must_have_item(cleaning_window)
            # Means we can use earlier correlations with sensible values
            corrmat = clean_correlation(
                corrmat, corr_with_no_data, must_haves)

        corr_list.append(corrmat)

    self.corr_list = corr_list
    self.columns = column_names
    self.fit_dates = fit_dates
def __init__(self, data, log=logtoscreen("optimiser"), frequency="W",
             date_method="expanding", rollyears=20, dict_group=None,
             boring_offdiag=0.99, cleaning=True, **kwargs):
    """
    We generate a correlation from either a pd.DataFrame, or a list of
    them if we're pooling.

    It's important that forward filling, or index / ffill / diff has been
    done before we begin.

    :param data: Data to get correlations from
    :type data: pd.DataFrame or list if pooling

    :param frequency: Downsampling frequency. Must be "D", "W" or bigger
    :type frequency: str

    :param date_method: Method to pass to generate_fitting_dates
    :type date_method: str

    :param roll_years: If date_method is "rolling", number of years in window
    :type roll_years: int

    :param dict_group: dictionary of groupings; used to replace missing
        values (default: empty dict)
    :type dict_group: dict

    :param boring_offdiag: Value used in creating 'boring' matrix, for when
        no data
    :type boring_offdiag: float

    :param **kwargs: passed to correlation_single_period

    :returns: CorrelationList
    """
    cleaning = str2Bool(cleaning)

    # Avoid a mutable default argument; empty dict means no grouping.
    if dict_group is None:
        dict_group = dict()

    # grouping dictionary, convert to faster, algo friendly, form
    group_dict = group_dict_from_natural(dict_group)

    data = df_from_list(data)
    column_names = list(data.columns)

    # BUGFIX: resample(frequency, how="last") uses the 'how' argument that
    # was removed from pandas; use the method-chained aggregation instead.
    data = data.resample(frequency).last()

    # Generate time periods
    fit_dates = generate_fitting_dates(
        data, date_method=date_method, rollyears=rollyears)

    size = len(column_names)
    corr_with_no_data = boring_corr_matrix(size, offdiag=boring_offdiag)

    # create a list of correlation matrices
    corr_list = []

    log.terse("Correlation estimate")

    # Now for each time period, estimate correlation
    for fit_period in fit_dates:
        log.msg("Estimating from %s to %s" %
                (fit_period.period_start, fit_period.period_end))

        if fit_period.no_data:
            # no data to fit with: all-NaN 'boring' matrix
            corrmat = boring_corr_matrix(size, offdiag=np.nan, diag=np.nan)
        else:
            data_for_estimate = data[fit_period.fit_start:fit_period.fit_end]
            corrmat = correlation_single_period(data_for_estimate, **kwargs)

        if cleaning:
            current_period_data = data[fit_period.fit_start:fit_period.fit_end]
            must_haves = must_have_item(current_period_data)
            # means we can use earlier correlations with sensible values
            corrmat = clean_correlation(corrmat, corr_with_no_data, must_haves)

        corr_list.append(corrmat)

    setattr(self, "corr_list", corr_list)
    setattr(self, "columns", column_names)
    setattr(self, "fit_dates", fit_dates)