def save_predictions(model, tag, partition):
    r"""Save the predictions to disk.

    Parameters
    ----------
    model : alphapy.Model
        The model object to save.
    tag : str
        A unique identifier for the output files, e.g., a date stamp.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    preds : numpy array
        The prediction vector.
    probas : numpy array
        The probability vector.

    """

    # Extract model parameters.

    directory = model.specs['directory']
    extension = model.specs['extension']
    model_type = model.specs['model_type']
    separator = model.specs['separator']

    # Get date stamp to record file creation

    timestamp = get_datestamp()

    # Specify input and output directories

    input_dir = SSEP.join([directory, 'input'])
    output_dir = SSEP.join([directory, 'output'])

    # Read the prediction frame

    file_spec = ''.join([datasets[partition], '*'])
    file_name = most_recent_file(input_dir, file_spec)
    file_name = file_name.split(SSEP)[-1].split(PSEP)[0]
    pf = read_frame(input_dir, file_name, extension, separator)

    # Cull records before the prediction date

    try:
        predict_date = model.specs['predict_date']
        found_pdate = True
    except KeyError:
        found_pdate = False

    if found_pdate:
        pd_indices = pf[pf.date >= predict_date].index.tolist()
        pf = pf.iloc[pd_indices]
    else:
        pd_indices = pf.index.tolist()

    # Save predictions for all projects

    logger.info("Saving Predictions")
    output_file = USEP.join(['predictions', timestamp])
    preds = model.preds[(tag, partition)].squeeze()
    if found_pdate:
        preds = np.take(preds, pd_indices)
    pred_series = pd.Series(preds, index=pd_indices)
    df_pred = pd.DataFrame(pred_series, columns=['prediction'])
    write_frame(df_pred, output_dir, output_file, extension, separator)

    # Save probabilities for classification projects

    probas = None
    if model_type == ModelType.classification:
        logger.info("Saving Probabilities")
        output_file = USEP.join(['probabilities', timestamp])
        probas = model.probas[(tag, partition)].squeeze()
        if found_pdate:
            probas = np.take(probas, pd_indices)
        prob_series = pd.Series(probas, index=pd_indices)
        df_prob = pd.DataFrame(prob_series, columns=['probability'])
        write_frame(df_prob, output_dir, output_file, extension, separator)

    # Save ranked predictions

    logger.info("Saving Ranked Predictions")
    pf['prediction'] = pred_series
    if model_type == ModelType.classification:
        pf['probability'] = prob_series
        pf.sort_values('probability', ascending=False, inplace=True)
    else:
        pf.sort_values('prediction', ascending=False, inplace=True)
    output_file = USEP.join(['rankings', timestamp])
    write_frame(pf, output_dir, output_file, extension, separator)

    # Return predictions and any probabilities

    return preds, probas
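
# --- Hypothetical usage sketch (not part of the pipeline) ---------------------
# Shows how save_predictions might be called once the pipeline has populated
# model.preds and model.probas. The tag 'best' and the Partition.predict
# partition are placeholder assumptions; use whatever tag your project stored.
def _example_save_predictions(model):
    # model is an alphapy.Model already fitted by the pipeline
    preds, probas = save_predictions(model, 'best', Partition.predict)
    # probas is None for regression projects
    return preds, probas
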
def main(args=None):
    r"""The main program for SportFlow.

    Notes
    -----
    (1) Initialize logging.
    (2) Parse the command line arguments.
    (3) Get the game configuration.
    (4) Get the model configuration.
    (5) Generate game frames for each season.
    (6) Create statistics for each team.
    (7) Merge the team frames into the final model frame.
    (8) Run the AlphaPy pipeline.

    Raises
    ------
    ValueError
        Training date must be before prediction date.

    """

    # Logging

    logging.basicConfig(format="[%(asctime)s] %(levelname)s\t%(message)s",
                        filename="sport_flow.log", filemode='a',
                        level=logging.DEBUG, datefmt='%m/%d/%y %H:%M:%S')
    formatter = logging.Formatter("[%(asctime)s] %(levelname)s\t%(message)s",
                                  datefmt='%m/%d/%y %H:%M:%S')
    console = logging.StreamHandler()
    console.setFormatter(formatter)
    console.setLevel(logging.INFO)
    logging.getLogger().addHandler(console)

    logger = logging.getLogger(__name__)

    # Start the pipeline

    logger.info('*'*80)
    logger.info("SportFlow Start")
    logger.info('*'*80)

    # Argument Parsing

    parser = argparse.ArgumentParser(description="SportFlow Parser")
    parser.add_argument('--pdate', dest='predict_date',
                        help="prediction date is in the format: YYYY-MM-DD",
                        required=False, type=valid_date)
    parser.add_argument('--tdate', dest='train_date',
                        help="training date is in the format: YYYY-MM-DD",
                        required=False, type=valid_date)
    # --predict and --train are mutually exclusive mode flags
    mode_group = parser.add_mutually_exclusive_group(required=False)
    mode_group.add_argument('--predict', dest='predict_mode', action='store_true')
    mode_group.add_argument('--train', dest='predict_mode', action='store_false')
    parser.set_defaults(predict_mode=False)
    args = parser.parse_args()

    # Set train and predict dates

    if args.train_date:
        train_date = args.train_date
    else:
        train_date = datetime.datetime(1900, 1, 1).strftime("%Y-%m-%d")

    if args.predict_date:
        predict_date = args.predict_date
    else:
        predict_date = datetime.date.today().strftime("%Y-%m-%d")

    # Verify that the dates are in sequence.

    if train_date >= predict_date:
        raise ValueError("Training date must be before prediction date")
    else:
        logger.info("Training Date: %s", train_date)
        logger.info("Prediction Date: %s", predict_date)

    # Read game configuration file

    sport_specs = get_sport_config()

    # Section: game

    league = sport_specs['league']
    points_max = sport_specs['points_max']
    points_min = sport_specs['points_min']
    random_scoring = sport_specs['random_scoring']
    seasons = sport_specs['seasons']
    window = sport_specs['rolling_window']

    # Read model configuration file

    specs = get_model_config()

    # Add command line arguments to model specifications

    specs['predict_mode'] = args.predict_mode
    specs['predict_date'] = args.predict_date
    specs['train_date'] = args.train_date

    # Unpack model arguments

    directory = specs['directory']
    target = specs['target']

    # Create directories if necessary

    output_dirs = ['config', 'data', 'input', 'model', 'output', 'plots']
    for od in output_dirs:
        output_dir = SSEP.join([directory, od])
        if not os.path.exists(output_dir):
            logger.info("Creating directory %s", output_dir)
            os.makedirs(output_dir)

    # Create the game scores space

    space = Space('game', 'scores', '1g')

    #
    # Derived Variables
    #

    series = space.schema
    team1_prefix = 'home'
    team2_prefix = 'away'
    home_team = PSEP.join([team1_prefix, 'team'])
    away_team = PSEP.join([team2_prefix, 'team'])

    #
    # Read in the game frame. This is the feature generation phase.
    #

    logger.info("Reading Game Data")

    data_dir = SSEP.join([directory, 'data'])
    file_base = USEP.join([league, space.subject, space.schema, space.fractal])
    df = read_frame(data_dir, file_base, specs['extension'], specs['separator'])
    logger.info("Total Game Records: %d", df.shape[0])

    #
    # Locate any rows with null values
    #

    null_rows = df.isnull().any(axis=1)
    null_indices = [i for i, val in enumerate(null_rows.tolist()) if val]
    for i in null_indices:
        logger.info("Null Record: %d on Date: %s", i, df.date[i])

    #
    # Run the game pipeline on a seasonal loop
    #

    if not seasons:
        # run model on all seasons
        seasons = df['season'].unique().tolist()

    #
    # Initialize the final frame
    #

    ff = pd.DataFrame()

    #
    # Iterate through each season of the game frame
    #

    for season in seasons:

        # Generate a frame for each season

        gf = df[df['season'] == season]
        gf = gf.reset_index()

        # Generate derived variables for the game frame

        total_games = gf.shape[0]
        if random_scoring:
            gf['home.score'] = np.random.randint(points_min, points_max, total_games)
            gf['away.score'] = np.random.randint(points_min, points_max, total_games)
        gf['total_points'] = gf['home.score'] + gf['away.score']

        gf = add_features(gf, game_dict, gf.shape[0])
        for index, row in gf.iterrows():
            gf['point_margin_game'].at[index] = get_point_margin(row, 'home.score', 'away.score')
            gf['won_on_points'].at[index] = gf['point_margin_game'].at[index] > 0
            gf['lost_on_points'].at[index] = gf['point_margin_game'].at[index] < 0
            gf['cover_margin_game'].at[index] = gf['point_margin_game'].at[index] + row['line']
            gf['won_on_spread'].at[index] = gf['cover_margin_game'].at[index] > 0
            gf['lost_on_spread'].at[index] = gf['cover_margin_game'].at[index] <= 0
            gf['overunder_margin'].at[index] = gf['total_points'].at[index] - row['over_under']
            gf['over'].at[index] = gf['overunder_margin'].at[index] > 0
            gf['under'].at[index] = gf['overunder_margin'].at[index] < 0

        # Generate each team frame

        team_frames = {}
        teams = gf.groupby([home_team])
        for team, data in teams:
            team_frame = USEP.join([league, team.lower(), series, str(season)])
            logger.info("Generating team frame: %s", team_frame)
            tf = get_team_frame(gf, team, home_team, away_team)
            tf = tf.reset_index()
            tf = generate_team_frame(team, tf, home_team, away_team, window)
            team_frames[team_frame] = tf

        # Create the model frame, initializing the home and away frames

        mdict = {k: v for (k, v) in list(sports_dict.items()) if v != bool}
        team1_frame = pd.DataFrame()
        team1_frame = add_features(team1_frame, mdict, gf.shape[0], prefix=team1_prefix)
        team2_frame = pd.DataFrame()
        team2_frame = add_features(team2_frame, mdict, gf.shape[0], prefix=team2_prefix)
        frames = [gf, team1_frame, team2_frame]
        mf = pd.concat(frames, axis=1)

        # Loop through each team frame, inserting data into the model frame row
        #     get index+1 [if valid]
        #     determine if team is home or away to get prefix
        #     try: np.where((gf[home_team] == 'PHI') & (gf['date'] == '09/07/14'))[0][0]
        #     Assign team frame fields to respective model frame fields: set gf.at(pos, field)

        for team, data in teams:
            team_frame = USEP.join([league, team.lower(), series, str(season)])
            logger.info("Merging team frame %s into model frame", team_frame)
            tf = team_frames[team_frame]
            for index in range(0, tf.shape[0]-1):
                gindex = index + 1
                model_row = tf.iloc[gindex]
                key_date = model_row['date']
                at_home = False
                if team == model_row[home_team]:
                    at_home = True
                    key_team = model_row[home_team]
                elif team == model_row[away_team]:
                    key_team = model_row[away_team]
                else:
                    raise KeyError("Team %s not found in Team Frame" % team)
                try:
                    if at_home:
                        mpos = np.where((mf[home_team] == key_team) & (mf['date'] == key_date))[0][0]
                    else:
                        mpos = np.where((mf[away_team] == key_team) & (mf['date'] == key_date))[0][0]
                except IndexError:
                    raise IndexError("Team/Date Key not found in Model Frame")
                # insert team data into model row
                mf = insert_model_data(mf, mpos, mdict, tf, index,
                                       team1_prefix if at_home else team2_prefix)

        # Compute delta data 'home' - 'away'

        mf = generate_delta_data(mf, mdict, team1_prefix, team2_prefix)

        # Append this to final frame

        frames = [ff, mf]
        ff = pd.concat(frames)

    # Write out dataframes

    input_dir = SSEP.join([directory, 'input'])
    if args.predict_mode:
        new_predict_frame = ff.loc[ff.date >= predict_date]
        if len(new_predict_frame) <= 1:
            raise ValueError("Prediction frame has length 1 or less")
        # rewrite with all the features to the prediction file
        logger.info("Saving prediction frame")
        write_frame(new_predict_frame, input_dir, datasets[Partition.predict],
                    specs['extension'], specs['separator'])
    else:
        # split data into training and test data
        new_train_frame = ff.loc[(ff.date >= train_date) & (ff.date < predict_date)]
        if len(new_train_frame) <= 1:
            raise ValueError("Training frame has length 1 or less")
        new_test_frame = ff.loc[ff.date >= predict_date]
        if len(new_test_frame) <= 1:
            raise ValueError("Testing frame has length 1 or less")
        # rewrite with all the features to the train and test files
        logger.info("Saving training frame")
        write_frame(new_train_frame, input_dir, datasets[Partition.train],
                    specs['extension'], specs['separator'])
        logger.info("Saving testing frame")
        write_frame(new_test_frame, input_dir, datasets[Partition.test],
                    specs['extension'], specs['separator'])

    # Create the model from specs

    logger.info("Running Model")
    model = Model(specs)

    # Run the pipeline

    model = main_pipeline(model)

    # Complete the pipeline

    logger.info('*'*80)
    logger.info("SportFlow End")
    logger.info('*'*80)
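
# --- Hypothetical usage sketch (not part of the pipeline) ---------------------
# SportFlow is normally driven from the command line with the flags defined in
# the argparse setup above. The 'sflow' console-script name is an assumption;
# invoke the module directly (python -m) if your installation does not
# register that entry point.
#
#   sflow --train --tdate 2016-08-01 --pdate 2017-08-01
#   sflow --predict --pdate 2017-08-01
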
def trade_system(model, system, space, intraday, name, quantity):
    r"""Trade the given system.

    Parameters
    ----------
    model : alphapy.Model
        The model object with specifications.
    system : alphapy.System
        The long/short system to run.
    space : alphapy.Space
        Namespace of instrument prices.
    intraday : bool
        If True, then run an intraday system.
    name : str
        The symbol to trade.
    quantity : float
        The amount of the ``name`` to trade, e.g., number of shares.

    Returns
    -------
    tradelist : list
        List of trade entries and exits.

    Other Parameters
    ----------------
    Frame.frames : dict
        All of the data frames containing price data.

    """

    # Unpack the model data.

    directory = model.specs['directory']
    extension = model.specs['extension']
    separator = model.specs['separator']

    # Unpack the system parameters.

    longentry = system.longentry
    shortentry = system.shortentry
    longexit = system.longexit
    shortexit = system.shortexit
    holdperiod = system.holdperiod
    scale = system.scale

    # Determine whether or not this is a model-driven system.

    entries_and_exits = [longentry, shortentry, longexit, shortexit]
    active_signals = [x for x in entries_and_exits if x is not None]
    use_model = False
    for signal in active_signals:
        if any(x in signal for x in ['phigh', 'plow']):
            use_model = True

    # Read in the price frame

    pf = Frame.frames[frame_name(name, space)].df

    # Use model output probabilities as input to the system

    if use_model:
        # get latest probabilities file
        probs_dir = SSEP.join([directory, 'output'])
        file_path = most_recent_file(probs_dir, 'probabilities*')
        file_name = file_path.split(SSEP)[-1].split('.')[0]
        # read the probabilities frame and trim the price frame
        probs_frame = read_frame(probs_dir, file_name, extension, separator)
        pf = pf[-probs_frame.shape[0]:]
        probs_frame.index = pf.index
        probs_frame.columns = ['probability']
        # add probability column to price frame
        pf = pd.concat([pf, probs_frame], axis=1)

    # Evaluate the long and short events in the price frame

    for signal in active_signals:
        vexec(pf, signal)

    # Initialize trading state variables

    inlong = False
    inshort = False
    h = 0
    p = 0
    q = quantity
    tradelist = []

    # Loop through prices and generate trades

    for dt, row in pf.iterrows():
        # get closing price
        c = row['close']
        if intraday:
            bar_number = row['bar_number']
            end_of_day = row['end_of_day']
        # evaluate entry and exit conditions
        lerow = row[longentry] if longentry else None
        serow = row[shortentry] if shortentry else None
        lxrow = row[longexit] if longexit else None
        sxrow = row[shortexit] if shortexit else None
        # process the long and short events
        if lerow:
            if p < 0:
                # short active, so exit short
                tradelist.append((dt, [name, Orders.sx, -p, c]))
                inshort = False
                h = 0
                p = 0
            if p == 0 or scale:
                # go long (again)
                tradelist.append((dt, [name, Orders.le, q, c]))
                inlong = True
                p = p + q
        elif serow:
            if p > 0:
                # long active, so exit long
                tradelist.append((dt, [name, Orders.lx, -p, c]))
                inlong = False
                h = 0
                p = 0
            if p == 0 or scale:
                # go short (again)
                tradelist.append((dt, [name, Orders.se, -q, c]))
                inshort = True
                p = p - q
        # check exit conditions
        if inlong and h > 0 and lxrow:
            # long active, so exit long
            tradelist.append((dt, [name, Orders.lx, -p, c]))
            inlong = False
            h = 0
            p = 0
        if inshort and h > 0 and sxrow:
            # short active, so exit short
            tradelist.append((dt, [name, Orders.sx, -p, c]))
            inshort = False
            h = 0
            p = 0
        # if a holding period was given, then check for exit
        if holdperiod and h >= holdperiod:
            if inlong:
                tradelist.append((dt, [name, Orders.lh, -p, c]))
                inlong = False
            if inshort:
                tradelist.append((dt, [name, Orders.sh, -p, c]))
                inshort = False
            h = 0
            p = 0
        # increment the hold counter
        if inlong or inshort:
            h += 1
        # close out any intraday positions at the end of the day
        if intraday and end_of_day:
            if inlong:
                # long active, so exit long
                tradelist.append((dt, [name, Orders.lx, -p, c]))
                inlong = False
            if inshort:
                # short active, so exit short
                tradelist.append((dt, [name, Orders.sx, -p, c]))
                inshort = False
            h = 0
            p = 0

    return tradelist
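
# --- Hypothetical usage sketch (not part of the pipeline) ---------------------
# Runs a simple daily long/short system against a price frame assumed to have
# been allocated in Frame.frames (e.g., by get_market_data). The System
# constructor arguments and the 'hc'/'lc' signal variable names are
# assumptions for illustration; see alphapy.system for the actual class.
def _example_trade_system(model):
    space = Space('stock', 'prices', '1d')
    system = System('closer', 'hc', 'lc')   # hypothetical long/short entry signals
    trades = trade_system(model, system, space, intraday=False,
                          name='aapl', quantity=100)
    return trades
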
def get_data(model, partition):
    r"""Get data for the given partition.

    Parameters
    ----------
    model : alphapy.Model
        The model object describing the data.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    X : pandas.DataFrame
        The feature set.
    y : pandas.Series
        The array of target values, if available.

    """

    logger.info("Loading Data")

    # Extract the model data

    directory = model.specs['directory']
    extension = model.specs['extension']
    features = model.specs['features']
    model_type = model.specs['model_type']
    separator = model.specs['separator']
    target = model.specs['target']
    test_file = model.test_file
    train_file = model.train_file

    # Read in the file

    filename = datasets[partition]
    input_dir = SSEP.join([directory, 'input'])
    df = read_frame(input_dir, filename, extension, separator)

    # Assign target and drop it if necessary

    y = np.empty([0, 0])
    if target in df.columns:
        logger.info("Found target %s in data frame", target)
        # check if target column has NaN values
        nan_count = df[target].isnull().sum()
        if nan_count > 0:
            logger.info("Found %d records with NaN target values", nan_count)
            logger.info("Labels (y) for %s will not be used", partition)
        else:
            # assign the target column to y
            y = df[target]
            # encode label only for classification
            if model_type == ModelType.classification:
                y = LabelEncoder().fit_transform(y)
            logger.info("Labels (y) found for %s", partition)
        # drop the target from the original frame
        df = df.drop([target], axis=1)
    else:
        logger.info("Target %s not found in %s", target, partition)

    # Extract features

    if features == WILDCARD:
        X = df
    else:
        X = df[features]

    # Labels are returned usually only for training data

    return X, y
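
# --- Hypothetical usage sketch (not part of the pipeline) ---------------------
# Loads the training partition of a project. Partition.train and the datasets
# mapping are the same objects used above; the helper name is hypothetical.
def _example_get_data(model):
    # y is an empty array when the target column is absent or, in this
    # version, when the target contains NaN values.
    X_train, y_train = get_data(model, Partition.train)
    return X_train, y_train
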
def get_market_data(model, group, lookback_period, resample_data):
    r"""Get data from an external feed.

    Parameters
    ----------
    model : alphapy.Model
        The model object describing the data.
    group : alphapy.Group
        The group of symbols.
    lookback_period : int
        The number of periods of data to retrieve.
    resample_data : bool
        If True, then resample the data to the group's fractal.

    Returns
    -------
    n_periods : int
        The maximum number of periods actually retrieved.

    """

    # Unpack model specifications

    directory = model.specs['directory']
    extension = model.specs['extension']
    separator = model.specs['separator']

    # Unpack group elements

    gspace = group.space
    schema = gspace.schema
    fractal = gspace.fractal

    # Determine the feed source

    if any(substring in fractal for substring in PD_INTRADAY_OFFSETS):
        # intraday data (date and time)
        logger.info("Getting Intraday Data [%s] from %s", fractal, schema)
        intraday_data = True
        index_column = 'datetime'
    else:
        # daily data or higher (date only)
        logger.info("Getting Daily Data [%s] from %s", fractal, schema)
        intraday_data = False
        index_column = 'date'

    # Get the data from the relevant feed

    data_dir = SSEP.join([directory, 'data'])
    pandas_data = any(substring in schema for substring in PD_WEB_DATA_FEEDS)
    n_periods = 0
    for item in group.members:
        logger.info("Getting %s data for last %d days", item, lookback_period)
        # reset the frame so a failed source does not reuse the previous symbol's data
        df = None
        # Locate the data source
        if schema == 'data':
            fname = frame_name(item.lower(), gspace)
            df = read_frame(data_dir, fname, extension, separator)
            if not intraday_data:
                df.set_index(pd.DatetimeIndex(df[index_column]),
                             drop=True, inplace=True)
        elif schema == 'google' and intraday_data:
            df = get_google_data(item, lookback_period, fractal)
        elif pandas_data:
            df = get_pandas_data(schema, item, lookback_period)
        else:
            logger.error("Unsupported Data Source: %s", schema)
        # Now that we have content, standardize the data
        if df is not None and not df.empty:
            logger.info("Rows: %d", len(df))
            # standardize column names
            df = df.rename(columns=lambda x: x.lower().replace(' ', ''))
            # add intraday columns if necessary
            if intraday_data:
                df = enhance_intraday_data(df)
            # order by increasing date if necessary
            df = df.sort_index()
            # resample data
            if resample_data:
                df = df.resample(fractal).agg({'open'   : 'first',
                                               'high'   : 'max',
                                               'low'    : 'min',
                                               'close'  : 'last',
                                               'volume' : 'sum'})
                logger.info("Rows after Resampling at %s: %d", fractal, len(df))
            # allocate global Frame
            newf = Frame(item.lower(), gspace, df)
            if newf is None:
                logger.error("Could not allocate Frame for: %s", item)
            # calculate maximum number of periods
            df_len = len(df)
            if df_len > n_periods:
                n_periods = df_len
        else:
            logger.info("No DataFrame for %s", item)

    # The number of periods actually retrieved

    return n_periods
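
# --- Hypothetical usage sketch (not part of the pipeline) ---------------------
# Pulls roughly one year of bars for every member of a previously defined
# Group and registers each frame in Frame.frames. The group argument and the
# 365-day lookback are assumptions for illustration.
def _example_get_market_data(model, group):
    # resample_data=False keeps the data at the feed's native fractal
    n_periods = get_market_data(model, group, 365, resample_data=False)
    return n_periods
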
def get_market_data(model, group, lookback_period, data_fractal,
                    intraday_data=False):
    r"""Get data from an external feed.

    Parameters
    ----------
    model : alphapy.Model
        The model object describing the data.
    group : alphapy.Group
        The group of symbols.
    lookback_period : int
        The number of periods of data to retrieve.
    data_fractal : str
        Pandas offset alias.
    intraday_data : bool
        If True, then get intraday data.

    Returns
    -------
    n_periods : int
        The maximum number of periods actually retrieved.

    """

    # Unpack model specifications

    directory = model.specs['directory']
    extension = model.specs['extension']
    separator = model.specs['separator']

    # Unpack group elements

    gspace = group.space
    schema = gspace.schema
    fractal = gspace.fractal

    # Determine the feed source

    if intraday_data:
        # intraday data (date and time)
        logger.info("Getting Intraday Data [%s] from %s", data_fractal, schema)
        index_column = 'datetime'
    else:
        # daily data or higher (date only)
        logger.info("Getting Daily Data [%s] from %s", data_fractal, schema)
        index_column = 'date'

    # Get the data from the relevant feed

    data_dir = SSEP.join([directory, 'data'])
    pandas_data = any(substring in schema for substring in PD_WEB_DATA_FEEDS)
    n_periods = 0
    resample_data = fractal != data_fractal
    df = None
    to_date = pd.to_datetime('today')
    from_date = to_date - pd.to_timedelta(lookback_period, unit='d')
    for item in group.members:
        logger.info("Getting %s data for last %d days", item, lookback_period)
        # Locate the data source
        if schema == 'data':
            # local intraday or daily
            dspace = Space(gspace.subject, gspace.schema, data_fractal)
            fname = frame_name(item.lower(), dspace)
            df = read_frame(data_dir, fname, extension, separator)
        elif schema == 'google' and intraday_data:
            # intraday only
            df = get_google_data(item, lookback_period, data_fractal)
        elif pandas_data:
            # daily only
            df = get_pandas_data(schema, item, lookback_period)
        else:
            logger.error("Unsupported Data Source: %s", schema)
        # Now that we have content, standardize the data
        if df is not None and not df.empty:
            logger.info("%d data points from %s to %s", len(df), from_date, to_date)
            # convert data to canonical form
            df = convert_data(df, index_column, intraday_data)
            # resample data and drop any NA values
            if resample_data:
                df = df.resample(fractal).agg({'open'   : 'first',
                                               'high'   : 'max',
                                               'low'    : 'min',
                                               'close'  : 'last',
                                               'volume' : 'sum'})
                df.dropna(axis=0, how='any', inplace=True)
                logger.info("Rows after Resampling at %s: %d", fractal, len(df))
            # add intraday columns if necessary
            if intraday_data:
                df = enhance_intraday_data(df)
            # allocate global Frame
            newf = Frame(item.lower(), gspace, df)
            if newf is None:
                logger.error("Could not allocate Frame for: %s", item)
            # calculate maximum number of periods
            df_len = len(df)
            if df_len > n_periods:
                n_periods = df_len
        else:
            logger.info("No DataFrame for %s", item)

    # The number of periods actually retrieved

    return n_periods
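
# --- Hypothetical usage sketch (not part of the pipeline) ---------------------
# Retrieves intraday data and lets the function resample it, since the group
# space's fractal (assumed here to be '5min') differs from the 1-minute
# data_fractal. The fractal strings are assumptions; any pandas offset alias
# accepted by DataFrame.resample works.
def _example_get_market_data_resampled(model, group):
    return get_market_data(model, group, lookback_period=30,
                           data_fractal='1min', intraday_data=True)
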
def get_data(model, partition):
    r"""Get data for the given partition.

    Parameters
    ----------
    model : alphapy.Model
        The model object describing the data.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    X : pandas.DataFrame
        The feature set.
    y : pandas.Series
        The array of target values, if available.

    Raises
    ------
    ValueError
        Found test labels with NaN values.

    """

    logger.info("Loading Data")

    # Extract the model data

    directory = model.specs['directory']
    extension = model.specs['extension']
    features = model.specs['features']
    model_type = model.specs['model_type']
    separator = model.specs['separator']
    target = model.specs['target']
    test_file = model.test_file
    train_file = model.train_file

    # Read in the file

    filename = datasets[partition]
    input_dir = SSEP.join([directory, 'input'])
    df = read_frame(input_dir, filename, extension, separator)

    # Assign target and drop it if necessary

    y = np.empty([0, 0])
    if target in df.columns:
        logger.info("Found target %s in data frame", target)
        # drop rows with NaN targets
        original_size = df.shape[0]
        df.dropna(axis=0, subset=[target], inplace=True)
        diff = original_size - df.shape[0]
        if diff > 0:
            raise ValueError("Found %d records in %s with NaN target values"
                             % (diff, partition))
        # assign the target column to y
        y = df[target]
        # encode label only for classification
        if model_type == ModelType.classification:
            y = LabelEncoder().fit_transform(y)
        # drop the target as it has already been extracted into y
        logger.info("Dropping target %s from data frame", target)
        df = df.drop([target], axis=1)
    else:
        logger.info("Target %s not found in %s", target, partition)

    # Extract features

    if features == WILDCARD:
        X = df
    else:
        X = df[features]

    # Labels are returned usually only for training data

    return X, y
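
# --- Hypothetical usage sketch (not part of the pipeline) ---------------------
# Unlike the earlier version above, this get_data raises ValueError instead of
# silently skipping labels when the target column contains NaN values, so the
# call can be guarded if missing targets are expected. The helper name is
# hypothetical.
def _example_get_data_strict(model):
    try:
        X_test, y_test = get_data(model, Partition.test)
    except ValueError as e:
        logger.error("Bad target values: %s", e)
        raise
    return X_test, y_test
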
def get_market_data(model, market_specs, group, lookback_period,
                    intraday_data=False):
    r"""Get data from an external feed.

    Parameters
    ----------
    model : alphapy.Model
        The model object describing the data.
    market_specs : dict
        The specifications for controlling the MarketFlow pipeline.
    group : alphapy.Group
        The group of symbols.
    lookback_period : int
        The number of periods of data to retrieve.
    intraday_data : bool
        If True, then get intraday data.

    Returns
    -------
    n_periods : int
        The maximum number of periods actually retrieved.

    """

    # Unpack market specifications

    data_fractal = market_specs['data_fractal']
    subschema = market_specs['subschema']

    # Unpack model specifications

    directory = model.specs['directory']
    extension = model.specs['extension']
    separator = model.specs['separator']

    # Unpack group elements

    gspace = group.space
    schema = gspace.schema
    fractal = gspace.fractal

    # Determine the feed source

    if intraday_data:
        # intraday data (date and time)
        logger.info("%s Intraday Data [%s] for %d periods",
                    schema, data_fractal, lookback_period)
        index_column = 'datetime'
    else:
        # daily data or higher (date only)
        logger.info("%s Daily Data [%s] for %d periods",
                    schema, data_fractal, lookback_period)
        index_column = 'date'

    # Get the data from the relevant feed

    data_dir = SSEP.join([directory, 'data'])
    n_periods = 0
    resample_data = fractal != data_fractal

    # Date Arithmetic

    to_date = pd.to_datetime('today')
    from_date = to_date - pd.to_timedelta(lookback_period, unit='d')
    to_date = to_date.strftime('%Y-%m-%d')
    from_date = from_date.strftime('%Y-%m-%d')

    # Get the data from the specified data feed

    df = pd.DataFrame()
    for symbol in group.members:
        logger.info("Getting %s data from %s to %s", symbol.upper(), from_date, to_date)
        # Locate the data source
        if schema == 'data':
            # local intraday or daily
            dspace = Space(gspace.subject, gspace.schema, data_fractal)
            fname = frame_name(symbol.lower(), dspace)
            df = read_frame(data_dir, fname, extension, separator)
        elif schema in data_dispatch_table.keys():
            df = data_dispatch_table[schema](schema, subschema, symbol,
                                             intraday_data, data_fractal,
                                             from_date, to_date, lookback_period)
        else:
            logger.error("Unsupported Data Source: %s", schema)
        # Now that we have content, standardize the data
        if not df.empty:
            logger.info("Rows: %d [%s]", len(df), data_fractal)
            # convert data to canonical form
            df = convert_data(df, index_column, intraday_data)
            # resample data and drop any NA values
            if resample_data:
                df = df.resample(fractal).agg({'open'   : 'first',
                                               'high'   : 'max',
                                               'low'    : 'min',
                                               'close'  : 'last',
                                               'volume' : 'sum'})
                df.dropna(axis=0, how='any', inplace=True)
                logger.info("Rows after Resampling at %s: %d", fractal, len(df))
            # add intraday columns if necessary
            if intraday_data:
                df = enhance_intraday_data(df)
            # allocate global Frame
            newf = Frame(symbol.lower(), gspace, df)
            if newf is None:
                logger.error("Could not allocate Frame for: %s", symbol.upper())
            # calculate maximum number of periods
            df_len = len(df)
            if df_len > n_periods:
                n_periods = df_len
        else:
            logger.info("No DataFrame for %s", symbol.upper())

    # The number of periods actually retrieved

    return n_periods
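
# --- Hypothetical usage sketch (not part of the pipeline) ---------------------
# In this newest version the feed is resolved through data_dispatch_table using
# the schema on the group's Space, while market_specs supplies the two keys
# consumed here. The '1d' fractal and 'yahoo' subschema values are assumptions;
# market_specs normally comes from the MarketFlow configuration file.
def _example_get_market_data_specs(model, group):
    market_specs = {'data_fractal': '1d', 'subschema': 'yahoo'}
    return get_market_data(model, market_specs, group,
                           lookback_period=365, intraday_data=False)
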