def run_analysis(analysis, lag_period, forecast_period, leaders,
                 predict_history, splits=True):
    r"""Run an analysis for a given model and group.

    First, the data are loaded for each member of the analysis group.
    Then, the target value is lagged for the ``forecast_period``, and
    any ``leaders`` are lagged as well. Each frame is split along the
    ``predict_date`` from the ``analysis``, and finally the train and
    test files are generated.

    Parameters
    ----------
    analysis : alphapy.Analysis
        The analysis to run.
    lag_period : int
        The number of lagged features for the analysis.
    forecast_period : int
        The period for forecasting the target of the analysis.
    leaders : list
        The features that are contemporaneous with the target.
    predict_history : int
        The number of periods required for lookback calculations.
    splits : bool, optional
        If ``True``, then the data for each member of the analysis
        group are in separate files.

    Returns
    -------
    analysis : alphapy.Analysis
        The completed analysis.

    """

    # Unpack analysis

    model = analysis.model
    group = analysis.group

    # Unpack model data

    predict_file = model.predict_file
    test_file = model.test_file
    train_file = model.train_file

    # Unpack model specifications

    directory = model.specs['directory']
    extension = model.specs['extension']
    predict_date = model.specs['predict_date']
    predict_mode = model.specs['predict_mode']
    separator = model.specs['separator']
    target = model.specs['target']
    train_date = model.specs['train_date']

    # Calculate split date

    logger.info("Analysis Dates")
    split_date = subtract_days(predict_date, predict_history)
    logger.info("Train Date: %s", train_date)
    logger.info("Split Date: %s", split_date)
    logger.info("Test Date: %s", predict_date)

    # Load the data frames

    data_frames = load_frames(group, directory, extension, separator, splits)

    # Create dataframes

    if predict_mode:
        # create predict frame
        predict_frame = pd.DataFrame()
    else:
        # create train and test frames
        train_frame = pd.DataFrame()
        test_frame = pd.DataFrame()

    # Subset each individual frame and add to the master frame

    # BUG FIX: build a new list rather than calling leaders.extend(...),
    # which mutated the caller's list argument in place on every call.
    leaders = leaders + [TAG_ID]
    for df in data_frames:
        try:
            tag = df[TAG_ID].unique()[0]
        except (KeyError, IndexError):
            # tag column is missing or the frame is empty
            tag = 'Unknown'
        first_date = df.index[0]
        last_date = df.index[-1]
        logger.info("Analyzing %s from %s to %s", tag, first_date, last_date)
        # sequence leaders, laggards, and target(s)
        df = sequence_frame(df, target, forecast_period, leaders, lag_period)
        # get frame subsets
        if predict_mode:
            new_predict = df.loc[(df.index >= split_date) & (df.index <= last_date)]
            if len(new_predict) > 0:
                # pd.concat replaces DataFrame.append (removed in pandas 2.0)
                predict_frame = pd.concat([predict_frame, new_predict])
            else:
                logger.info("Prediction frame %s has zero rows. Check prediction date.",
                            tag)
        else:
            # split data into train and test
            new_train = df.loc[(df.index >= train_date) & (df.index < split_date)]
            if len(new_train) > 0:
                new_train = new_train.dropna()
                train_frame = pd.concat([train_frame, new_train])
                new_test = df.loc[(df.index >= split_date) & (df.index <= last_date)]
                if len(new_test) > 0:
                    # check if target column has NaN values
                    nan_count = df[target].isnull().sum()
                    forecast_check = forecast_period - 1
                    if nan_count != forecast_check:
                        logger.info("%s has %d records with NaN targets",
                                    tag, nan_count)
                    # drop records with NaN values in target column
                    new_test = new_test.dropna(subset=[target])
                    # append selected records to the test frame
                    test_frame = pd.concat([test_frame, new_test])
                else:
                    logger.info("Testing frame %s has zero rows. Check prediction date.",
                                tag)
            else:
                logger.info("Training frame %s has zero rows. Check data source.",
                            tag)

    # Write out the frames for input into the AlphaPy pipeline

    directory = SSEP.join([directory, 'input'])
    if predict_mode:
        # write out the predict frame
        write_frame(predict_frame, directory, predict_file, extension,
                    separator, index=True, index_label='date')
    else:
        # write out the train and test frames
        write_frame(train_frame, directory, train_file, extension,
                    separator, index=True, index_label='date')
        write_frame(test_frame, directory, test_file, extension,
                    separator, index=True, index_label='date')

    # Run the AlphaPy pipeline

    analysis.model = main_pipeline(model)

    # Return the analysis

    return analysis
def run_analysis(analysis, forecast_period, leaders,
                 predict_history, splits=True):
    r"""Run an analysis for a given model and group.

    First, the data are loaded for each member of the analysis group.
    Then, the target value is lagged for the ``forecast_period``, and
    any ``leaders`` are lagged as well. Each frame is split along the
    ``predict_date`` from the ``analysis``, and finally the train and
    test files are generated.

    Parameters
    ----------
    analysis : alphapy.Analysis
        The analysis to run.
    forecast_period : int
        The period for forecasting the target of the analysis.
    leaders : list
        The features that are contemporaneous with the target.
    predict_history : int
        The number of periods required for lookback calculations.
    splits : bool, optional
        If ``True``, then the data for each member of the analysis
        group are in separate files.

    Returns
    -------
    analysis : alphapy.Analysis
        The completed analysis.

    """

    # Unpack analysis

    model = analysis.model
    group = analysis.group

    # Unpack model data

    predict_file = model.predict_file
    test_file = model.test_file
    test_labels = model.test_labels
    train_file = model.train_file

    # Unpack model specifications

    directory = model.specs['directory']
    extension = model.specs['extension']
    predict_date = model.specs['predict_date']
    predict_mode = model.specs['predict_mode']
    separator = model.specs['separator']
    target = model.specs['target']
    train_date = model.specs['train_date']

    # Calculate split date

    split_date = subtract_days(predict_date, predict_history)

    # Load the data frames

    data_frames = load_frames(group, directory, extension, separator, splits)

    # Create dataframes

    if predict_mode:
        # create predict frame
        predict_frame = pd.DataFrame()
    else:
        # create train and test frames
        train_frame = pd.DataFrame()
        test_frame = pd.DataFrame()

    # Subset each individual frame and add to the master frame

    for df in data_frames:
        # NOTE(review): the frames from load_frames are modified in place
        # below (target shift, index shift) — confirm callers do not reuse
        # them afterwards.
        last_date = df.index[-1]
        # shift the target for the forecast period
        if forecast_period > 0:
            df[target] = df[target].shift(-forecast_period)
            df.index = df.index.shift(forecast_period, freq='D')
        # shift any leading features if necessary
        if leaders:
            df[leaders] = df[leaders].shift(-1)
        # get frame subsets
        if predict_mode:
            new_predict = df.loc[(df.index >= split_date) & (df.index <= last_date)]
            if len(new_predict) > 0:
                # pd.concat replaces DataFrame.append (removed in pandas 2.0)
                predict_frame = pd.concat([predict_frame, new_predict])
            else:
                logger.info("A prediction frame has zero rows. Check prediction date.")
        else:
            # split data into train and test
            new_train = df.loc[(df.index >= train_date) & (df.index < split_date)]
            if len(new_train) > 0:
                # train frame
                new_train = new_train.dropna()
                train_frame = pd.concat([train_frame, new_train])
                # test frame
                new_test = df.loc[(df.index >= split_date) & (df.index <= last_date)]
                if len(new_test) > 0:
                    if test_labels:
                        new_test = new_test.dropna()
                    test_frame = pd.concat([test_frame, new_test])
                else:
                    logger.info("A testing frame has zero rows. Check prediction date.")
            else:
                logger.warning("A training frame has zero rows. Check data source.")

    # Write out the frames for input into the AlphaPy pipeline

    directory = SSEP.join([directory, 'input'])
    if predict_mode:
        # write out the predict frame
        write_frame(predict_frame, directory, predict_file, extension,
                    separator, index=True, index_label='date')
    else:
        # write out the train and test frames
        write_frame(train_frame, directory, train_file, extension,
                    separator, index=True, index_label='date')
        write_frame(test_frame, directory, test_file, extension,
                    separator, index=True, index_label='date')

    # Run the AlphaPy pipeline

    analysis.model = main_pipeline(model)

    # Return the analysis

    return analysis