def save_predictions(model, tag, partition):
    r"""Save the predictions to disk.

    Parameters
    ----------
    model : alphapy.Model
        The model object to save.
    tag : str
        A unique identifier for the output files, e.g., a date stamp.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    preds : numpy array
        The prediction vector.
    probas : numpy array
        The probability vector.

    """

    # Extract model parameters.

    directory = model.specs['directory']
    extension = model.specs['extension']
    model_type = model.specs['model_type']
    separator = model.specs['separator']

    # Get date stamp to record file creation

    timestamp = get_datestamp()

    # Specify input and output directories

    input_dir = SSEP.join([directory, 'input'])
    output_dir = SSEP.join([directory, 'output'])

    # Read the prediction frame

    file_spec = ''.join([datasets[partition], '*'])
    file_name = most_recent_file(input_dir, file_spec)
    file_name = file_name.split(SSEP)[-1].split(PSEP)[0]
    pf = read_frame(input_dir, file_name, extension, separator)

    # Cull records before the prediction date

    try:
        predict_date = model.specs['predict_date']
        found_pdate = True
    except KeyError:
        found_pdate = False

    if found_pdate:
        # pf is freshly read, so its RangeIndex labels align with positions
        pd_indices = pf[pf.date >= predict_date].index.tolist()
        pf = pf.iloc[pd_indices]
    else:
        pd_indices = pf.index.tolist()

    # Save predictions for all projects

    logger.info("Saving Predictions")
    output_file = USEP.join(['predictions', timestamp])
    preds = model.preds[(tag, partition)].squeeze()
    if found_pdate:
        preds = np.take(preds, pd_indices)
    pred_series = pd.Series(preds, index=pd_indices)
    df_pred = pd.DataFrame(pred_series, columns=['prediction'])
    write_frame(df_pred, output_dir, output_file, extension, separator)

    # Save probabilities for classification projects

    probas = None
    if model_type == ModelType.classification:
        logger.info("Saving Probabilities")
        output_file = USEP.join(['probabilities', timestamp])
        probas = model.probas[(tag, partition)].squeeze()
        if found_pdate:
            probas = np.take(probas, pd_indices)
        prob_series = pd.Series(probas, index=pd_indices)
        df_prob = pd.DataFrame(prob_series, columns=['probability'])
        write_frame(df_prob, output_dir, output_file, extension, separator)

    # Save ranked predictions

    logger.info("Saving Ranked Predictions")
    pf['prediction'] = pred_series
    if model_type == ModelType.classification:
        pf['probability'] = prob_series
        pf.sort_values('probability', ascending=False, inplace=True)
    else:
        pf.sort_values('prediction', ascending=False, inplace=True)
    output_file = USEP.join(['rankings', timestamp])
    write_frame(pf, output_dir, output_file, extension, separator)

    # Return predictions and any probabilities

    return preds, probas
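
# Example usage (a sketch, not part of the pipeline): given a trained model
# whose predictions are keyed by (tag, partition), the call below writes the
# date-stamped prediction, probability, and ranking files. The 'BEST' tag
# mirrors the save_model(model, 'BEST', Partition.test) call made at the end
# of the training pipeline.
#
#   preds, probas = save_predictions(model, 'BEST', Partition.test)
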
def run_system(model, system, group, intraday=False, quantity=1):
    r"""Run a system for a given group, creating a trades frame.

    Parameters
    ----------
    model : alphapy.Model
        The model object with specifications.
    system : alphapy.System
        The system to run.
    group : alphapy.Group
        The group of symbols to trade.
    intraday : bool, optional
        If ``True``, this is an intraday system.
    quantity : float, optional
        The amount to trade for each symbol, e.g., the number of shares.

    Returns
    -------
    tf : pandas.DataFrame
        All of the trades for this ``group``.

    """

    system_name = system.name
    logger.info("Generating Trades for System %s", system_name)

    # Unpack the model data.

    directory = model.specs['directory']
    extension = model.specs['extension']
    separator = model.specs['separator']

    # Extract the group information.

    gname = group.name
    gmembers = group.members
    gspace = group.space

    # Run the system for each member of the group

    gtlist = []
    for symbol in gmembers:
        # generate the trades for this member
        tlist = trade_system(model, system, gspace, intraday, symbol, quantity)
        if tlist:
            # add trades to the global trade list
            for item in tlist:
                gtlist.append(item)
        else:
            logger.info("No trades for symbol %s", symbol)

    # Create group trades frame

    tf = None
    if gtlist:
        tspace = Space(system_name, "trades", group.space.fractal)
        gtlist = sorted(gtlist, key=lambda x: x[0])
        # DataFrame.from_items was removed in pandas 1.0; build the frame
        # from parallel index/data lists to preserve duplicate trade dates.
        tf = DataFrame([item[1] for item in gtlist],
                       index=[item[0] for item in gtlist],
                       columns=Trade.states)
        tfname = frame_name(gname, tspace)
        system_dir = SSEP.join([directory, 'systems'])
        labels = ['date']
        if intraday:
            labels.append('time')
        write_frame(tf, system_dir, tfname, extension, separator,
                    index=True, index_label=labels)
        del tspace
    else:
        logger.info("No trades were found")

    # Return trades frame
    return tf
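
# Example usage (a sketch): model and group are assumed to be configured
# elsewhere, and 'closer' with entry conditions hc and lc follows the sample
# System from the AlphaPy documentation.
#
#   system = System('closer', hc, lc)
#   tf = run_system(model, system, group)
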
def training_pipeline(model):
    r"""AlphaPy Training Pipeline

    Parameters
    ----------
    model : alphapy.Model
        The model object for controlling the pipeline.

    Returns
    -------
    model : alphapy.Model
        The final results are stored in the model object.

    Raises
    ------
    IndexError
        If the number of columns of the train and test data do not
        match, then this exception is raised.
    KeyError
        If the scoring function is not found, then this exception
        is raised.

    """

    logger.info("Training Pipeline")

    # Unpack the model specifications

    calibration = model.specs['calibration']
    directory = model.specs['directory']
    drop = model.specs['drop']
    extension = model.specs['extension']
    feature_selection = model.specs['feature_selection']
    grid_search = model.specs['grid_search']
    model_type = model.specs['model_type']
    predict_mode = model.specs['predict_mode']
    rfe = model.specs['rfe']
    sampling = model.specs['sampling']
    scorer = model.specs['scorer']
    separator = model.specs['separator']
    target = model.specs['target']

    # Get train and test data

    X_train, y_train = get_data(model, Partition.train)
    X_test, y_test = get_data(model, Partition.test)

    # Determine if there are any test labels

    if y_test.any():
        logger.info("Test Labels Found")
        model.test_labels = True
    model = save_features(model, X_train, X_test, y_train, y_test)

    # Log feature statistics

    logger.info("Original Feature Statistics")
    logger.info("Number of Training Rows    : %d", X_train.shape[0])
    logger.info("Number of Training Columns : %d", X_train.shape[1])
    if model_type == ModelType.classification:
        uv, uc = np.unique(y_train, return_counts=True)
        logger.info("Unique Training Values for %s : %s", target, uv)
        logger.info("Unique Training Counts for %s : %s", target, uc)
    logger.info("Number of Testing Rows    : %d", X_test.shape[0])
    logger.info("Number of Testing Columns : %d", X_test.shape[1])
    if model_type == ModelType.classification and model.test_labels:
        uv, uc = np.unique(y_test, return_counts=True)
        logger.info("Unique Testing Values for %s : %s", target, uv)
        logger.info("Unique Testing Counts for %s : %s", target, uc)

    # Merge training and test data

    if X_train.shape[1] == X_test.shape[1]:
        split_point = X_train.shape[0]
        X = pd.concat([X_train, X_test])
    else:
        raise IndexError("The number of training and test columns [%d, %d] must match."
                         % (X_train.shape[1], X_test.shape[1]))

    # Apply treatments to the feature matrix
    all_features = apply_treatments(model, X)

    # Drop features
    all_features = drop_features(all_features, drop)

    # Save the train and test files with extracted and dropped features

    datestamp = get_datestamp()
    data_dir = SSEP.join([directory, 'input'])
    df_train = all_features.iloc[:split_point, :]
    df_train = pd.concat([df_train, pd.DataFrame(y_train, columns=[target])], axis=1)
    output_file = USEP.join([model.train_file, datestamp])
    write_frame(df_train, data_dir, output_file, extension, separator)
    df_test = all_features.iloc[split_point:, :]
    if y_test.any():
        df_test = pd.concat([df_test, pd.DataFrame(y_test, columns=[target])], axis=1)
    output_file = USEP.join([model.test_file, datestamp])
    write_frame(df_test, data_dir, output_file, extension, separator)

    # Create crosstabs for any categorical features

    if model_type == ModelType.classification:
        create_crosstabs(model)

    # Create initial features

    all_features = create_features(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Generate interactions

    all_features = create_interactions(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Remove low-variance features

    all_features = remove_lv_features(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Shuffle the data [if specified]
    model = shuffle_data(model)

    # Oversampling or Undersampling [if specified]

    if model_type == ModelType.classification:
        if sampling:
            model = sample_data(model)
        else:
            logger.info("Skipping Sampling")
        # Get sample weights (classification only)
        model = get_class_weights(model)

    # Perform feature selection, independent of algorithm

    if feature_selection:
        model = select_features(model)

    # Get the available classifiers and regressors

    logger.info("Getting All Estimators")
    estimators = get_estimators(model)

    # Get the available scorers

    if scorer not in scorers:
        raise KeyError("Scorer function %s not found" % scorer)

    # Model Selection

    logger.info("Selecting Models")

    for algo in model.algolist:
        logger.info("Algorithm: %s", algo)
        # select estimator
        try:
            estimator = estimators[algo]
            scoring = estimator.scoring
            est = estimator.estimator
        except KeyError:
            # skip algorithms without a registered estimator; otherwise
            # the loop would proceed with a stale or undefined estimator
            logger.info("Algorithm %s not found", algo)
            continue
        # initial fit
        model = first_fit(model, algo, est)
        # recursive feature elimination
        if rfe:
            if scoring:
                model = rfecv_search(model, algo)
            elif hasattr(est, "coef_"):
                model = rfe_search(model, algo)
            else:
                logger.info("No RFE Available for %s", algo)
        # grid search
        if grid_search:
            model = hyper_grid_search(model, estimator)
        # predictions
        model = make_predictions(model, algo, calibration)

    # Create a blended estimator

    if len(model.algolist) > 1:
        model = predict_blend(model)

    # Generate metrics

    model = generate_metrics(model, Partition.train)
    model = generate_metrics(model, Partition.test)

    # Store the best estimator
    model = predict_best(model)

    # Generate plots

    generate_plots(model, Partition.train)
    if model.test_labels:
        generate_plots(model, Partition.test)

    # Save best features and predictions
    save_model(model, 'BEST', Partition.test)

    # Return the model
    return model
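
# Example usage (a sketch): the training pipeline is normally driven by
# main_pipeline, but it can be exercised directly once a Model has been
# created from the project configuration:
#
#   specs = get_model_config()
#   model = Model(specs)
#   model = training_pipeline(model)
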
def run_system(model, system, group, quantity=1):
    r"""Run a system for a given group, creating a trades frame.

    Parameters
    ----------
    model : alphapy.Model
        The model object with specifications.
    system : alphapy.System or str
        The system to run, either a long/short system or a local one
        identified by function name, e.g., 'open_range_breakout'.
    group : alphapy.Group
        The group of symbols to test.
    quantity : float, optional
        The amount to trade for each symbol, e.g., the number of shares.

    Returns
    -------
    tf : pandas.DataFrame
        All of the trades for this ``group``.

    """

    if isinstance(system, str):
        system_name = system
    else:
        system_name = system.name
    logger.info("Generating Trades for System %s", system_name)

    # Unpack the model data.

    directory = model.specs['directory']
    extension = model.specs['extension']
    separator = model.specs['separator']

    # Extract the group information.

    gname = group.name
    gmembers = group.members
    gspace = group.space

    # Run the system for each member of the group

    gtlist = []
    for symbol in gmembers:
        # generate the trades for this member
        if isinstance(system, str):
            try:
                tlist = globals()[system_name](symbol, gspace, quantity)
            except Exception:
                tlist = None
                logger.info("Could not execute system for %s", symbol)
        else:
            # call default long/short system
            tlist = long_short(system, symbol, gspace, quantity)
        if tlist:
            # add trades to the global trade list
            for item in tlist:
                gtlist.append(item)
        else:
            logger.info("No trades for symbol %s", symbol)

    # Create group trades frame

    tf = None
    if gtlist:
        tspace = Space(system_name, "trades", group.space.fractal)
        gtlist = sorted(gtlist, key=lambda x: x[0])
        # DataFrame.from_items was removed in pandas 1.0; build the frame
        # from parallel index/data lists to preserve duplicate trade dates.
        tf = DataFrame([item[1] for item in gtlist],
                       index=[item[0] for item in gtlist],
                       columns=Trade.states)
        tfname = frame_name(gname, tspace)
        system_dir = SSEP.join([directory, 'systems'])
        write_frame(tf, system_dir, tfname, extension, separator, index=True)
        del tspace
    else:
        logger.info("No trades were found")

    # Return trades frame
    return tf
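
# Example usage (a sketch; 'open_range_breakout' is the locally defined
# system function named in the docstring above):
#
#   tf = run_system(model, 'open_range_breakout', group, quantity=100)
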
def run_analysis(analysis, lag_period, forecast_period, leaders,
                 predict_history, splits=True):
    r"""Run an analysis for a given model and group.

    First, the data are loaded for each member of the analysis group.
    Then, the target value is lagged for the ``forecast_period``, and
    any ``leaders`` are lagged as well. Each frame is split along the
    ``predict_date`` from the ``analysis``, and finally the train and
    test files are generated.

    Parameters
    ----------
    analysis : alphapy.Analysis
        The analysis to run.
    lag_period : int
        The number of lagged features for the analysis.
    forecast_period : int
        The period for forecasting the target of the analysis.
    leaders : list
        The features that are contemporaneous with the target.
    predict_history : int
        The number of periods required for lookback calculations.
    splits : bool, optional
        If ``True``, then the data for each member of the analysis
        group are in separate files.

    Returns
    -------
    analysis : alphapy.Analysis
        The completed analysis.

    """

    # Unpack analysis

    name = analysis.name
    model = analysis.model
    group = analysis.group

    # Unpack model data

    predict_file = model.predict_file
    test_file = model.test_file
    train_file = model.train_file

    # Unpack model specifications

    directory = model.specs['directory']
    extension = model.specs['extension']
    predict_date = model.specs['predict_date']
    predict_mode = model.specs['predict_mode']
    separator = model.specs['separator']
    target = model.specs['target']
    train_date = model.specs['train_date']

    # Calculate split date

    logger.info("Analysis Dates")
    split_date = subtract_days(predict_date, predict_history)
    logger.info("Train Date: %s", train_date)
    logger.info("Split Date: %s", split_date)
    logger.info("Test  Date: %s", predict_date)

    # Load the data frames

    data_frames = load_frames(group, directory, extension, separator, splits)

    # Create dataframes

    if predict_mode:
        # create predict frame
        predict_frame = pd.DataFrame()
    else:
        # create train and test frames
        train_frame = pd.DataFrame()
        test_frame = pd.DataFrame()

    # Subset each individual frame and add to the master frame

    leaders.extend([TAG_ID])
    for df in data_frames:
        try:
            tag = df[TAG_ID].unique()[0]
        except (KeyError, IndexError):
            tag = 'Unknown'
        first_date = df.index[0]
        last_date = df.index[-1]
        logger.info("Analyzing %s from %s to %s", tag, first_date, last_date)
        # sequence leaders, laggards, and target(s)
        df = sequence_frame(df, target, forecast_period, leaders, lag_period)
        # get frame subsets
        if predict_mode:
            new_predict = df.loc[(df.index >= split_date) & (df.index <= last_date)]
            if len(new_predict) > 0:
                # DataFrame.append was removed in pandas 2.0; use pd.concat
                predict_frame = pd.concat([predict_frame, new_predict])
            else:
                logger.info("Prediction frame %s has zero rows. Check prediction date.",
                            tag)
        else:
            # split data into train and test
            new_train = df.loc[(df.index >= train_date) & (df.index < split_date)]
            if len(new_train) > 0:
                new_train = new_train.dropna()
                train_frame = pd.concat([train_frame, new_train])
                new_test = df.loc[(df.index >= split_date) & (df.index <= last_date)]
                if len(new_test) > 0:
                    # check if the target column has NaN values
                    nan_count = df[target].isnull().sum()
                    forecast_check = forecast_period - 1
                    if nan_count != forecast_check:
                        logger.info("%s has %d records with NaN targets", tag, nan_count)
                    # drop records with NaN values in the target column
                    new_test = new_test.dropna(subset=[target])
                    # append selected records to the test frame
                    test_frame = pd.concat([test_frame, new_test])
                else:
                    logger.info("Testing frame %s has zero rows. Check prediction date.",
                                tag)
            else:
                logger.info("Training frame %s has zero rows. Check data source.", tag)

    # Write out the frames for input into the AlphaPy pipeline

    directory = SSEP.join([directory, 'input'])
    if predict_mode:
        # write out the predict frame
        write_frame(predict_frame, directory, predict_file, extension,
                    separator, index=True, index_label='date')
    else:
        # write out the train and test frames
        write_frame(train_frame, directory, train_file, extension,
                    separator, index=True, index_label='date')
        write_frame(test_frame, directory, test_file, extension,
                    separator, index=True, index_label='date')

    # Run the AlphaPy pipeline

    analysis.model = main_pipeline(model)

    # Return the analysis

    return analysis
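
# Example usage (a sketch; the Analysis constructor arguments and the
# parameter values shown here are assumptions, not project defaults):
#
#   analysis = Analysis(model, group)
#   analysis = run_analysis(analysis, lag_period=10, forecast_period=1,
#                           leaders=[], predict_history=50)
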
def main(args=None):
    r"""The main program for SportFlow.

    Notes
    -----
    (1) Initialize logging.
    (2) Parse the command line arguments.
    (3) Get the game configuration.
    (4) Get the model configuration.
    (5) Generate game frames for each season.
    (6) Create statistics for each team.
    (7) Merge the team frames into the final model frame.
    (8) Run the AlphaPy pipeline.

    Raises
    ------
    ValueError
        Training date must be before prediction date.

    """

    # Logging

    logging.basicConfig(format="[%(asctime)s] %(levelname)s\t%(message)s",
                        filename="sport_flow.log", filemode='a',
                        level=logging.DEBUG,
                        datefmt='%m/%d/%y %H:%M:%S')
    formatter = logging.Formatter("[%(asctime)s] %(levelname)s\t%(message)s",
                                  datefmt='%m/%d/%y %H:%M:%S')
    console = logging.StreamHandler()
    console.setFormatter(formatter)
    console.setLevel(logging.INFO)
    logging.getLogger().addHandler(console)
    logger = logging.getLogger(__name__)

    # Start the pipeline

    logger.info('*'*80)
    logger.info("SportFlow Start")
    logger.info('*'*80)

    # Argument Parsing

    parser = argparse.ArgumentParser(description="SportFlow Parser")
    parser.add_argument('--pdate', dest='predict_date',
                        help="prediction date is in the format: YYYY-MM-DD",
                        required=False, type=valid_date)
    parser.add_argument('--tdate', dest='train_date',
                        help="training date is in the format: YYYY-MM-DD",
                        required=False, type=valid_date)
    # the mode flags must be added to the mutually exclusive group itself,
    # not to the parser, or the exclusivity is never enforced
    mode_group = parser.add_mutually_exclusive_group(required=False)
    mode_group.add_argument('--predict', dest='predict_mode', action='store_true')
    mode_group.add_argument('--train', dest='predict_mode', action='store_false')
    parser.set_defaults(predict_mode=False)
    args = parser.parse_args()

    # Set train and predict dates

    if args.train_date:
        train_date = args.train_date
    else:
        # pd.datetime is deprecated; use the standard library datetime
        train_date = datetime.datetime(1900, 1, 1).strftime("%Y-%m-%d")

    if args.predict_date:
        predict_date = args.predict_date
    else:
        predict_date = datetime.date.today().strftime("%Y-%m-%d")

    # Verify that the dates are in sequence.

    if train_date >= predict_date:
        raise ValueError("Training date must be before prediction date")
    else:
        logger.info("Training Date: %s", train_date)
        logger.info("Prediction Date: %s", predict_date)

    # Read game configuration file

    sport_specs = get_sport_config()

    # Section: game

    league = sport_specs['league']
    points_max = sport_specs['points_max']
    points_min = sport_specs['points_min']
    random_scoring = sport_specs['random_scoring']
    seasons = sport_specs['seasons']
    window = sport_specs['rolling_window']

    # Read model configuration file

    specs = get_model_config()

    # Add command line arguments to model specifications

    specs['predict_mode'] = args.predict_mode
    specs['predict_date'] = args.predict_date
    specs['train_date'] = args.train_date

    # Unpack model arguments

    directory = specs['directory']
    target = specs['target']

    # Create directories if necessary

    output_dirs = ['config', 'data', 'input', 'model', 'output', 'plots']
    for od in output_dirs:
        output_dir = SSEP.join([directory, od])
        if not os.path.exists(output_dir):
            logger.info("Creating directory %s", output_dir)
            os.makedirs(output_dir)

    # Create the game scores space
    space = Space('game', 'scores', '1g')

    #
    # Derived Variables
    #

    series = space.schema
    team1_prefix = 'home'
    team2_prefix = 'away'
    home_team = PSEP.join([team1_prefix, 'team'])
    away_team = PSEP.join([team2_prefix, 'team'])

    #
    # Read in the game frame. This is the feature generation phase.
    #

    logger.info("Reading Game Data")

    data_dir = SSEP.join([directory, 'data'])
    file_base = USEP.join([league, space.subject, space.schema, space.fractal])
    df = read_frame(data_dir, file_base, specs['extension'], specs['separator'])
    logger.info("Total Game Records: %d", df.shape[0])

    #
    # Locate any rows with null values
    #

    null_rows = df.isnull().any(axis=1)
    null_indices = [i for i, val in enumerate(null_rows.tolist()) if val]
    for i in null_indices:
        logger.info("Null Record: %d on Date: %s", i, df.date[i])

    #
    # Run the game pipeline on a seasonal loop
    #

    if not seasons:
        # run model on all seasons
        seasons = df['season'].unique().tolist()

    #
    # Initialize the final frame
    #

    ff = pd.DataFrame()

    #
    # Iterate through each season of the game frame
    #

    for season in seasons:

        # Generate a frame for each season

        gf = df[df['season'] == season]
        gf = gf.reset_index()

        # Generate derived variables for the game frame

        total_games = gf.shape[0]
        if random_scoring:
            gf['home.score'] = np.random.randint(points_min, points_max, total_games)
            gf['away.score'] = np.random.randint(points_min, points_max, total_games)
        gf['total_points'] = gf['home.score'] + gf['away.score']

        gf = add_features(gf, game_dict, gf.shape[0])
        for index, row in gf.iterrows():
            # write with gf.at[row, col] rather than chained assignment, so
            # the values land in the frame itself and not in a copy
            point_margin = get_point_margin(row, 'home.score', 'away.score')
            gf.at[index, 'point_margin_game'] = point_margin
            gf.at[index, 'won_on_points'] = point_margin > 0
            gf.at[index, 'lost_on_points'] = point_margin < 0
            cover_margin = point_margin + row['line']
            gf.at[index, 'cover_margin_game'] = cover_margin
            gf.at[index, 'won_on_spread'] = cover_margin > 0
            gf.at[index, 'lost_on_spread'] = cover_margin <= 0
            overunder_margin = gf.at[index, 'total_points'] - row['over_under']
            gf.at[index, 'overunder_margin'] = overunder_margin
            gf.at[index, 'over'] = overunder_margin > 0
            gf.at[index, 'under'] = overunder_margin < 0

        # Generate each team frame

        team_frames = {}
        # group by the column name itself so each group key is the team string
        teams = gf.groupby(home_team)
        for team, data in teams:
            team_frame = USEP.join([league, team.lower(), series, str(season)])
            logger.info("Generating team frame: %s", team_frame)
            tf = get_team_frame(gf, team, home_team, away_team)
            tf = tf.reset_index()
            tf = generate_team_frame(team, tf, home_team, away_team, window)
            team_frames[team_frame] = tf

        # Create the model frame, initializing the home and away frames

        mdict = {k: v for (k, v) in list(sports_dict.items()) if v != bool}
        team1_frame = pd.DataFrame()
        team1_frame = add_features(team1_frame, mdict, gf.shape[0], prefix=team1_prefix)
        team2_frame = pd.DataFrame()
        team2_frame = add_features(team2_frame, mdict, gf.shape[0], prefix=team2_prefix)
        frames = [gf, team1_frame, team2_frame]
        mf = pd.concat(frames, axis=1)

        # Loop through each team frame, inserting data into the model frame row:
        #     get index+1 [if valid]
        #     determine if team is home or away to get prefix
        #     try: np.where((gf[home_team] == 'PHI') & (gf['date'] == '09/07/14'))[0][0]
        #     assign team frame fields to respective model frame fields

        for team, data in teams:
            team_frame = USEP.join([league, team.lower(), series, str(season)])
            logger.info("Merging team frame %s into model frame", team_frame)
            tf = team_frames[team_frame]
            for index in range(0, tf.shape[0]-1):
                gindex = index + 1
                model_row = tf.iloc[gindex]
                key_date = model_row['date']
                at_home = False
                if team == model_row[home_team]:
                    at_home = True
                    key_team = model_row[home_team]
                elif team == model_row[away_team]:
                    key_team = model_row[away_team]
                else:
                    raise KeyError("Team %s not found in Team Frame" % team)
                try:
                    if at_home:
                        mpos = np.where((mf[home_team] == key_team)
                                        & (mf['date'] == key_date))[0][0]
                    else:
                        mpos = np.where((mf[away_team] == key_team)
                                        & (mf['date'] == key_date))[0][0]
                except IndexError:
                    raise IndexError("Team/Date Key not found in Model Frame")
                # insert team data into model row
                mf = insert_model_data(mf, mpos, mdict, tf, index,
                                       team1_prefix if at_home else team2_prefix)

        # Compute delta data 'home' - 'away'
        mf = generate_delta_data(mf, mdict, team1_prefix, team2_prefix)

        # Append this to final frame
        frames = [ff, mf]
        ff = pd.concat(frames)

    # Write out dataframes

    input_dir = SSEP.join([directory, 'input'])
    if args.predict_mode:
        new_predict_frame = ff.loc[ff.date >= predict_date]
        if len(new_predict_frame) <= 1:
            raise ValueError("Prediction frame has length 1 or less")
        # rewrite with all the features to the prediction file
        logger.info("Saving prediction frame")
        write_frame(new_predict_frame, input_dir, datasets[Partition.predict],
                    specs['extension'], specs['separator'])
    else:
        # split data into training and test data
        new_train_frame = ff.loc[(ff.date >= train_date) & (ff.date < predict_date)]
        if len(new_train_frame) <= 1:
            raise ValueError("Training frame has length 1 or less")
        new_test_frame = ff.loc[ff.date >= predict_date]
        if len(new_test_frame) <= 1:
            raise ValueError("Testing frame has length 1 or less")
        # rewrite with all the features to the train and test files
        logger.info("Saving training frame")
        write_frame(new_train_frame, input_dir, datasets[Partition.train],
                    specs['extension'], specs['separator'])
        logger.info("Saving testing frame")
        write_frame(new_test_frame, input_dir, datasets[Partition.test],
                    specs['extension'], specs['separator'])

    # Create the model from specs
    logger.info("Running Model")
    model = Model(specs)

    # Run the pipeline
    model = main_pipeline(model)

    # Complete the pipeline

    logger.info('*'*80)
    logger.info("SportFlow End")
    logger.info('*'*80)
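
# Example usage (a sketch, assuming the sflow console script installed by
# the package setup): run a training pass for a given date range from the
# project directory, then a prediction pass.
#
#   sflow --train --tdate 2015-01-01 --pdate 2016-01-01
#   sflow --predict --pdate 2016-01-01
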
def gen_portfolio(model, system, group, tframe,
                  startcap=100000, posby='close'):
    r"""Create a portfolio from a trades frame.

    Parameters
    ----------
    model : alphapy.Model
        The model with specifications.
    system : str
        Name of the system.
    group : alphapy.Group
        The group of instruments in the portfolio.
    tframe : pandas.DataFrame
        The input trade list from running the system.
    startcap : float
        Starting capital.
    posby : str
        The position sizing column in the price dataframe.

    Returns
    -------
    p : alphapy.Portfolio
        The generated portfolio.

    Raises
    ------
    MemoryError
        Could not allocate Portfolio.

    Notes
    -----
    This function also generates the files required for analysis
    by the *pyfolio* package:

    * Returns File
    * Positions File
    * Transactions File

    """

    logger.info("Creating Portfolio for System %s", system)

    # Unpack the model data.

    directory = model.specs['directory']
    extension = model.specs['extension']
    separator = model.specs['separator']

    # Create the portfolio.

    gname = group.name
    gspace = group.space
    gmembers = group.members
    ff = 1.0 / len(gmembers)

    p = Portfolio(gname,
                  system,
                  gspace,
                  startcap=startcap,
                  posby=posby,
                  restricted=False,
                  fixedfrac=ff)
    if not p:
        raise MemoryError("Could not allocate Portfolio")

    # Build pyfolio data from the trades frame.

    start = tframe.index[0]
    end = tframe.index[-1]
    trange = np.unique(tframe.index.map(lambda x: x.date().strftime('%Y-%m-%d'))).tolist()
    drange = date_range(start, end).map(lambda x: x.date().strftime('%Y-%m-%d'))

    # Initialize return, position, and transaction data.

    rs = []
    pcols = list(gmembers)
    pcols.extend(['cash'])
    pf = DataFrame(index=drange, columns=pcols).fillna(0.0)
    ts = []

    # Iterate through the date range, updating the portfolio.

    for d in drange:
        # process today's trades
        if d in trange:
            # .ix was removed in pandas 1.0; use .loc for label indexing
            trades = tframe.loc[d]
            if isinstance(trades, Series):
                trades = DataFrame(trades).transpose()
            for tdate, row in trades.iterrows():
                tsize = exec_trade(p, row['name'], row['order'],
                                   row['quantity'], row['price'], tdate)
                if tsize != 0:
                    ts.append((d, [tsize, row['price'], row['name']]))
                else:
                    logger.info("Trade could not be executed for %s", row['name'])
        # iterate through current positions
        positions = p.positions
        for key in positions:
            pos = positions[key]
            value = pos.value if pos.quantity > 0 else -pos.value
            # write directly with .at: mutating a row extracted with .loc
            # would modify a copy, not the positions frame
            pf.at[d, pos.name] = value
        pf.at[d, 'cash'] = p.cash
        # update the portfolio returns
        p = valuate_portfolio(p, d)
        rs.append((d, [p.netreturn]))

    # Create systems directory path

    system_dir = SSEP.join([directory, 'systems'])

    # Create and record the returns frame for this system.

    logger.info("Recording Returns Frame")
    rspace = Space(system, 'returns', gspace.fractal)
    # DataFrame.from_items was removed in pandas 1.0; build the frames
    # from parallel index/data lists instead.
    rf = DataFrame([item[1] for item in rs],
                   index=[item[0] for item in rs],
                   columns=['return'])
    rfname = frame_name(gname, rspace)
    write_frame(rf, system_dir, rfname, extension, separator,
                index=True, index_label='date')
    del rspace

    # Record the positions frame for this system.

    logger.info("Recording Positions Frame")
    pspace = Space(system, 'positions', gspace.fractal)
    pfname = frame_name(gname, pspace)
    write_frame(pf, system_dir, pfname, extension, separator,
                index=True, index_label='date')
    del pspace

    # Create and record the transactions frame for this system.

    logger.info("Recording Transactions Frame")
    tspace = Space(system, 'transactions', gspace.fractal)
    tf = DataFrame([item[1] for item in ts],
                   index=[item[0] for item in ts],
                   columns=['amount', 'price', 'symbol'])
    tfname = frame_name(gname, tspace)
    write_frame(tf, system_dir, tfname, extension, separator,
                index=True, index_label='date')
    del tspace

    # Return the portfolio.
    return p
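
# Example usage (a sketch, chaining a system run into a portfolio; the
# returns, positions, and transactions frames written above are the inputs
# pyfolio expects for a tear sheet):
#
#   tf = run_system(model, system, group)
#   p = gen_portfolio(model, system.name, group, tf)
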
def run_analysis(analysis, forecast_period, leaders,
                 predict_history, splits=True):
    r"""Run an analysis for a given model and group.

    First, the data are loaded for each member of the analysis group.
    Then, the target value is lagged for the ``forecast_period``, and
    any ``leaders`` are lagged as well. Each frame is split along the
    ``predict_date`` from the ``analysis``, and finally the train and
    test files are generated.

    Parameters
    ----------
    analysis : alphapy.Analysis
        The analysis to run.
    forecast_period : int
        The period for forecasting the target of the analysis.
    leaders : list
        The features that are contemporaneous with the target.
    predict_history : int
        The number of periods required for lookback calculations.
    splits : bool, optional
        If ``True``, then the data for each member of the analysis
        group are in separate files.

    Returns
    -------
    analysis : alphapy.Analysis
        The completed analysis.

    """

    # Unpack analysis

    name = analysis.name
    model = analysis.model
    group = analysis.group

    # Unpack model data

    predict_file = model.predict_file
    test_file = model.test_file
    test_labels = model.test_labels
    train_file = model.train_file

    # Unpack model specifications

    directory = model.specs['directory']
    extension = model.specs['extension']
    predict_date = model.specs['predict_date']
    predict_mode = model.specs['predict_mode']
    separator = model.specs['separator']
    target = model.specs['target']
    train_date = model.specs['train_date']

    # Calculate split date

    split_date = subtract_days(predict_date, predict_history)

    # Load the data frames

    data_frames = load_frames(group, directory, extension, separator, splits)

    # Create dataframes

    if predict_mode:
        # create predict frame
        predict_frame = pd.DataFrame()
    else:
        # create train and test frames
        train_frame = pd.DataFrame()
        test_frame = pd.DataFrame()

    # Subset each individual frame and add to the master frame

    for df in data_frames:
        last_date = df.index[-1]
        # shift the target for the forecast period
        if forecast_period > 0:
            df[target] = df[target].shift(-forecast_period)
            df.index = df.index.shift(forecast_period, freq='D')
        # shift any leading features if necessary
        if leaders:
            df[leaders] = df[leaders].shift(-1)
        # get frame subsets
        if predict_mode:
            new_predict = df.loc[(df.index >= split_date) & (df.index <= last_date)]
            if len(new_predict) > 0:
                # DataFrame.append was removed in pandas 2.0; use pd.concat
                predict_frame = pd.concat([predict_frame, new_predict])
            else:
                logger.info("A prediction frame has zero rows. Check prediction date.")
        else:
            # split data into train and test
            new_train = df.loc[(df.index >= train_date) & (df.index < split_date)]
            if len(new_train) > 0:
                # train frame
                new_train = new_train.dropna()
                train_frame = pd.concat([train_frame, new_train])
                # test frame
                new_test = df.loc[(df.index >= split_date) & (df.index <= last_date)]
                if len(new_test) > 0:
                    if test_labels:
                        new_test = new_test.dropna()
                    test_frame = pd.concat([test_frame, new_test])
                else:
                    logger.info("A testing frame has zero rows. Check prediction date.")
            else:
                logger.warning("A training frame has zero rows. Check data source.")

    # Write out the frames for input into the AlphaPy pipeline

    directory = SSEP.join([directory, 'input'])
    if predict_mode:
        # write out the predict frame
        write_frame(predict_frame, directory, predict_file, extension,
                    separator, index=True, index_label='date')
    else:
        # write out the train and test frames
        write_frame(train_frame, directory, train_file, extension,
                    separator, index=True, index_label='date')
        write_frame(test_frame, directory, test_file, extension,
                    separator, index=True, index_label='date')

    # Run the AlphaPy pipeline

    analysis.model = main_pipeline(model)

    # Return the analysis

    return analysis
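
# Worked example (a sketch of the target shift above, with made-up values):
# with forecast_period=1, df[target].shift(-1) moves the next period's label
# onto the current row's features, so
#
#   date        close   target          date        close   target
#   2020-01-01  100     1        ==>    2020-01-01  100     0
#   2020-01-02  101     0               2020-01-02  101     NaN
#
# The trailing NaN rows are the ones later dropped from the train frame.
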