def load_predictor(directory):
    r"""Load the model predictor from storage.

    By default, the most recent model is loaded into memory.

    Parameters
    ----------
    directory : str
        Full directory specification of the predictor's location.

    Returns
    -------
    predictor : function
        The scoring function, or ``None`` if no predictor could be
        located or loaded.

    """
    # Build the search location up front so the failure message below
    # can always refer to it.
    search_dir = SSEP.join([directory, 'model'])
    file_spec = 'model_*.pkl'
    search_path = SSEP.join([search_dir, file_spec])
    # Default to None so the return statement never hits an unbound name.
    predictor = None
    try:
        file_name = most_recent_file(search_dir, file_spec)
        logger.info("Loading model predictor from %s", file_name)
        predictor = joblib.load(file_name)
    except Exception:
        # Best effort: report the problem and hand back None.
        logger.error("Could not find model predictor in %s", search_path)
    return predictor
def save_predictor(model, timestamp):
    r"""Write the model's best estimator to a time-stamped Pickle file.

    The file is named ``model_<timestamp>.pkl`` and is written to the
    ``model`` subdirectory of the project directory.

    Parameters
    ----------
    model : alphapy.Model
        The model object that contains the best estimator.
    timestamp : str
        Date stamp embedded in the file name.

    Returns
    -------
    None : None

    """
    logger.info("Saving Model Predictor")
    project_dir = model.specs['directory']
    # The best estimator is stored under the 'BEST' key.
    best_estimator = model.estimators['BEST']
    pickle_name = ''.join(['model_', timestamp, '.pkl'])
    target = SSEP.join([project_dir, 'model', pickle_name])
    logger.info("Writing model predictor to %s", target)
    joblib.dump(best_estimator, target)
def write_frame(df, directory, filename, extension, separator,
                index=False, index_label=None):
    r"""Write a dataframe into a delimiter-separated file.

    Parameters
    ----------
    df : pandas.DataFrame
        The pandas dataframe to save to a file.
    directory : str
        Full directory specification.
    filename : str
        Name of the file to write, excluding the ``extension``.
    extension : str
        File name extension, e.g., ``csv``.
    separator : str
        The delimiter between fields in the file.
    index : bool, optional
        If ``True``, write the row names (index).
    index_label : str, optional
        A column label for the ``index``.

    Returns
    -------
    None : None

    Notes
    -----
    Writing is best effort: a failure is logged and not raised.

    """
    file_only = PSEP.join([filename, extension])
    file_all = SSEP.join([directory, file_only])
    logger.info("Writing data frame to %s", file_all)
    try:
        df.to_csv(file_all, sep=separator, index=index,
                  index_label=index_label)
    except Exception:
        # Catch Exception rather than everything so that Ctrl-C and
        # interpreter shutdown are not silently swallowed.
        logger.info("Could not write data frame to %s", file_all)
def load_predictor(directory):
    r"""Load the model predictor from storage.

    By default, the most recent model is loaded into memory.

    Parameters
    ----------
    directory : str
        Full directory specification of the predictor's location.

    Returns
    -------
    predictor : function
        The scoring function, or ``None`` if no predictor could be
        located or loaded.

    """
    # Create search path
    search_path = SSEP.join([directory, 'model', 'model_*.pkl'])
    # Default to None so the return statement never hits an unbound name.
    predictor = None
    try:
        # The latest file is the match with the greatest ctime; max()
        # raises ValueError when nothing matches the pattern.
        filename = max(glob.iglob(search_path), key=os.path.getctime)
        logger.info("Loading model predictor from %s", filename)
        predictor = joblib.load(filename)
    except Exception:
        # Best effort: report the problem and hand back None.
        logger.error("Could not find model predictor in %s", search_path)
    return predictor
def load_feature_map(model, directory):
    r"""Load the feature map from storage.

    By default, the most recent feature map is loaded into memory.

    Parameters
    ----------
    model : alphapy.Model
        The model object to contain the feature map.
    directory : str
        Full directory specification of the feature map's location.

    Returns
    -------
    model : alphapy.Model
        The model object containing the feature map. If no feature map
        could be located or loaded, the model is returned unchanged.

    """
    # Build the search location up front so the failure message below
    # can always refer to it.
    search_dir = SSEP.join([directory, 'model'])
    file_spec = 'feature_map_*.pkl'
    search_path = SSEP.join([search_dir, file_spec])
    try:
        file_name = most_recent_file(search_dir, file_spec)
        logger.info("Loading feature map from %s", file_name)
        model.feature_map = joblib.load(file_name)
    except Exception:
        # Best effort: report the problem and leave the model as it was.
        logger.error("Could not find feature map in %s", search_path)
    return model
def np_store_data(data, dir_name, file_name, extension, separator):
    r"""Save NumPy data to a delimited text file.

    Parameters
    ----------
    data : numpy array
        The data to store.
    dir_name : str
        Full directory specification.
    file_name : str
        Name of the file to write, excluding the ``extension``.
    extension : str
        File name extension, e.g., ``csv``.
    separator : str
        The delimiter between fields in the file.

    Returns
    -------
    None : None

    """
    # <dir_name><SSEP><file_name><PSEP><extension>
    target = SSEP.join([dir_name, PSEP.join([file_name, extension])])
    logger.info("Storing output to %s", target)
    np.savetxt(target, data, delimiter=separator)
def save_feature_map(model, timestamp):
    r"""Write the model's feature map to a time-stamped Pickle file.

    The file is named ``feature_map_<timestamp>.pkl`` and is written to
    the ``model`` subdirectory of the project directory.

    Parameters
    ----------
    model : alphapy.Model
        The model object containing the feature map.
    timestamp : str
        Date stamp embedded in the file name.

    Returns
    -------
    None : None

    """
    logger.info("Saving Feature Map")
    project_dir = model.specs['directory']
    pickle_name = ''.join(['feature_map_', timestamp, '.pkl'])
    target = SSEP.join([project_dir, 'model', pickle_name])
    logger.info("Writing feature map to %s", target)
    joblib.dump(model.feature_map, target)
def get_estimators(model):
    r"""Define all the AlphaPy estimators based on the contents
    of the ``algos.yml`` file.

    Parameters
    ----------
    model : alphapy.Model
        The model object containing global AlphaPy parameters.

    Returns
    -------
    estimators : dict
        All of the estimators required for running the pipeline,
        keyed by algorithm name.

    Notes
    -----
    Any estimator parameter whose name appears in the substitution
    table below is overwritten with the corresponding global value
    from ``model.specs``. The ``params`` dictionaries of the algorithm
    specifications are updated in place.

    """
    # Extract model data
    directory = model.specs['directory']
    n_estimators = model.specs['n_estimators']
    n_jobs = model.specs['n_jobs']
    seed = model.specs['seed']
    verbosity = model.specs['verbosity']

    # Global parameter substitution: estimator parameter name -> value.
    # A direct value table replaces the earlier eval() of local names.
    substitutions = {
        'n_estimators': n_estimators,
        'n_jobs': n_jobs,
        'nthread': n_jobs,
        'random_state': seed,
        'seed': seed,
        'verbose': verbosity,
    }

    # Get algorithm specifications
    config_dir = SSEP.join([directory, 'config'])
    algo_specs = get_algos_config(config_dir)

    # Create estimators for all of the algorithms
    estimators = {}
    for algo, spec in algo_specs.items():
        model_type = spec['model_type']
        params = spec['params']
        # Only existing keys are reassigned, so the dictionary size does
        # not change while it is being iterated.
        for param in params:
            if param in substitutions:
                params[param] = substitutions[param]
        func = estimator_map[algo]
        est = func(**params)
        grid = spec['grid']
        scoring = spec['scoring']
        estimators[algo] = Estimator(algo, model_type, est, grid, scoring)

    return estimators
def get_algos_config(cfg_dir):
    r"""Load and validate the algorithm specifications in ``algos.yml``.

    Parameters
    ----------
    cfg_dir : str
        The directory where the configuration file ``algos.yml``
        is stored.

    Returns
    -------
    specs : dict
        The specifications for determining which algorithms to run.
        For every algorithm with the expected set of keys, the
        ``model_type`` entry is replaced by its ``ModelType`` member.

    Raises
    ------
    ValueError
        An algorithm with the expected keys has an unrecognized
        ``model_type``.

    Notes
    -----
    An algorithm whose keys differ from the expected set is only
    reported with a warning; its entry is left untouched.

    """
    logger.info("Algorithm Configuration")

    # Read the configuration file
    config_file = SSEP.join([cfg_dir, 'algos.yml'])
    with open(config_file, 'r') as stream:
        specs = yaml.load(stream, Loader=yaml.FullLoader)

    # Find optional packages
    find_optional_packages()

    # Algorithms with 'KERAS' in their name need two extra keys
    base_keys = ['model_type', 'params', 'grid']
    keras_keys = base_keys + ['layers', 'compiler']

    for algo in specs:
        expected = keras_keys if 'KERAS' in algo else base_keys
        found = list(specs[algo].keys())
        if set(found) != set(expected):
            logger.warning("Algorithm %s has the wrong keys %s",
                           algo, expected)
            logger.warning("Keys found instead: %s", found)
            continue
        # Translate the model type name into its enumeration member
        known_types = {member.name: member.value for member in ModelType}
        type_name = specs[algo]['model_type']
        if type_name not in known_types:
            raise ValueError("algos.yml model:type %s unrecognized" % type_name)
        specs[algo]['model_type'] = ModelType(known_types[type_name])

    return specs
def get_plot_directory(model):
    r"""Return the ``plots`` subdirectory of the model's directory.

    Parameters
    ----------
    model : alphapy.Model
        The model object with directory information.

    Returns
    -------
    plot_directory : str
        The output directory to write the plot.

    """
    return SSEP.join([model.specs['directory'], 'plots'])
def get_sport_config():
    r"""Read the configuration file for SportFlow.

    The file ``sport.yml`` is read from the ``config`` subdirectory of
    the current directory.

    Parameters
    ----------
    None : None

    Returns
    -------
    specs : dict
        The parameters for controlling SportFlow.

    """
    # Read the configuration file. An explicit Loader is required by
    # current PyYAML releases and matches the other configuration
    # readers in this file.
    full_path = SSEP.join(['.', 'config', 'sport.yml'])
    with open(full_path, 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

    # Store configuration parameters in dictionary
    specs = {}

    # Section: sport
    sport_keys = ('league', 'points_max', 'points_min',
                  'random_scoring', 'rolling_window', 'seasons')
    for key in sport_keys:
        specs[key] = cfg['sport'][key]

    # Log the sports parameters
    logger.info('SPORT PARAMETERS:')
    logger.info('league = %s', specs['league'])
    logger.info('points_max = %d', specs['points_max'])
    logger.info('points_min = %d', specs['points_min'])
    logger.info('random_scoring = %r', specs['random_scoring'])
    logger.info('rolling_window = %d', specs['rolling_window'])
    logger.info('seasons = %s', specs['seasons'])

    # Game Specifications
    return specs
def read_frame(directory, filename, extension, separator,
               index_col=None, squeeze=False):
    r"""Read a delimiter-separated file into a data frame.

    Parameters
    ----------
    directory : str
        Full directory specification.
    filename : str
        Name of the file to read, excluding the ``extension``.
    extension : str
        File name extension, e.g., ``csv``.
    separator : str
        The delimiter between fields in the file.
    index_col : str, optional
        Column to use as the row labels in the dataframe.
    squeeze : bool, optional
        If the data contains only one column, then return a pandas Series.

    Returns
    -------
    df : pandas.DataFrame
        The pandas dataframe loaded from the file location. If the file
        cannot be read, then an empty dataframe is returned.

    """
    file_only = PSEP.join([filename, extension])
    file_all = SSEP.join([directory, file_only])
    logger.info("Loading data from %s", file_all)
    try:
        df = pd.read_csv(file_all, sep=separator, index_col=index_col,
                         low_memory=False)
        if squeeze:
            # The ``squeeze`` keyword of read_csv was deprecated in
            # pandas 1.4 and removed in 2.0; squeezing the columns
            # afterwards is the documented replacement and works on
            # both old and new releases.
            df = df.squeeze('columns')
    except Exception:
        # Best effort: report the problem and return an empty frame.
        df = pd.DataFrame()
        logger.info("Could not find or access %s", file_all)
    return df
def get_pandas_data(schema, symbol, lookback_period):
    r"""Get Pandas Web Reader data.

    Parameters
    ----------
    schema : str
        The source of the pandas-datareader data.
    symbol : str
        A valid stock symbol.
    lookback_period : int
        The number of days of data to retrieve, counted back from now.

    Returns
    -------
    df : pandas.DataFrame
        The dataframe returned by the data reader, or ``None`` if the
        data could not be retrieved.

    """
    # Quandl is a special case with subfeeds: the schema is expected to
    # look like ``quandl<USEP><database>``, and the database becomes a
    # prefix of the symbol.
    if 'quandl' in schema:
        try:
            schema, symbol_prefix = schema.split(USEP)
            symbol = SSEP.join([symbol_prefix, symbol])
        except Exception:
            # Best effort: keep the schema and symbol as given.
            logger.info("Quandl schema format must be: quandl_DB. Ex: quandl_wiki")

    # Calculate the start and end date from one clock reading so the
    # window is exactly ``lookback_period`` days wide.
    end = datetime.now()
    start = end - timedelta(lookback_period)

    # Call the Pandas Web data reader.
    df = None
    try:
        df = web.DataReader(symbol.upper(), schema, start, end)
    except Exception:
        logger.info("Could not retrieve data for: %s", symbol)
    return df
def get_quandl_data(schema, subschema, symbol, intraday_data, data_fractal,
                    from_date, to_date, lookback_period):
    r"""Retrieve Quandl data through ``get_pandas_data``.

    The symbol is qualified with its subschema (both upper-cased)
    before the request is forwarded.

    Parameters
    ----------
    schema : str
        The schema for this data feed.
    subschema : str
        Any subschema for this data feed.
    symbol : str
        A valid stock symbol.
    intraday_data : bool
        If True, then get intraday data.
    data_fractal : str
        Pandas offset alias.
    from_date : str
        Starting date for symbol retrieval.
    to_date : str
        Ending date for symbol retrieval.
    lookback_period : int
        The number of periods of data to retrieve.

    Returns
    -------
    df : pandas.DataFrame
        The dataframe containing the market data.

    """
    # Quandl is a special case with subfeeds.
    quandl_symbol = SSEP.join([subschema.upper(), symbol.upper()])
    # NOTE(review): this forwards eight positional arguments, so the
    # get_pandas_data in scope must accept that signature - confirm
    # against the definition actually used at runtime.
    return get_pandas_data(schema, subschema, quandl_symbol, intraday_data,
                           data_fractal, from_date, to_date, lookback_period)
def most_recent_file(directory, file_spec):
    r"""Return the matching file with the greatest ``ctime``.

    Parameters
    ----------
    directory : str
        Full directory specification.
    file_spec : str
        Wildcard search string for the file to locate.

    Returns
    -------
    file_name : str
        The path, as produced by ``glob``, of the most recent match.

    Raises
    ------
    ValueError
        No file matches the search string (``max`` of an empty sequence).

    """
    pattern = SSEP.join([directory, file_spec])
    candidates = glob.iglob(pattern)
    return max(candidates, key=os.path.getctime)
def load_predictor(directory):
    r"""Load the model predictor from storage.

    By default, the most recent model is loaded into memory. Files
    ending in ``pkl`` are loaded with ``joblib`` and files ending in
    ``h5`` with ``load_model``.

    Parameters
    ----------
    directory : str
        Full directory specification of the predictor's location.

    Returns
    -------
    predictor : function
        The scoring function, or ``None`` if the most recent model file
        does not have a supported extension.

    """
    # Locate the model Pickle or HD5 file
    search_dir = SSEP.join([directory, 'model'])
    file_spec = 'model_*.*'
    search_path = SSEP.join([search_dir, file_spec])
    file_name = most_recent_file(search_dir, file_spec)

    # Load the model from the file, based on its extension. Default to
    # None so the return statement never hits an unbound name.
    predictor = None
    file_ext = file_name.split(PSEP)[-1]
    if file_ext == 'pkl':
        logger.info("Loading model predictor from %s", file_name)
        predictor = joblib.load(file_name)
    elif file_ext == 'h5':
        logger.info("Loading model predictor from %s", file_name)
        predictor = load_model(file_name)
    else:
        logger.error("Could not find model predictor in %s", search_path)

    return predictor
def get_market_config():
    r"""Read the configuration file for MarketFlow.

    Besides filling the returned dictionary, this function creates the
    ``Space``, ``Group``, ``Alias``, and ``Variable`` objects described
    by the configuration file. All sections other than ``market`` are
    optional.

    Parameters
    ----------
    None : None

    Returns
    -------
    specs : dict
        The parameters for controlling MarketFlow.

    """
    logger.info("MarketFlow Configuration")

    # Read the configuration file. An explicit Loader is required by
    # current PyYAML releases and matches the other configuration
    # readers in this file.
    full_path = SSEP.join([PSEP, 'config', 'market.yml'])
    with open(full_path, 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

    # Store configuration parameters in dictionary
    specs = {}

    # Section: market [this section must be first]
    specs['forecast_period'] = cfg['market']['forecast_period']
    specs['fractal'] = cfg['market']['fractal']
    specs['leaders'] = cfg['market']['leaders']
    specs['data_history'] = cfg['market']['data_history']
    specs['predict_history'] = cfg['market']['predict_history']
    specs['schema'] = cfg['market']['schema']
    specs['target_group'] = cfg['market']['target_group']

    # Create the subject/schema/fractal namespace
    sspecs = ['stock', specs['schema'], specs['fractal']]
    space = Space(*sspecs)

    # Section: features
    try:
        logger.info("Getting Features")
        specs['features'] = cfg['features']
    except KeyError:
        logger.info("No Features Found")
        specs['features'] = {}

    # Section: groups
    try:
        logger.info("Defining Groups")
        for g, m in cfg['groups'].items():
            Group(g, space)
            Group.groups[g].add(m)
    except Exception:
        # Best effort: a missing or malformed section is only reported.
        logger.info("No Groups Found")

    # Section: aliases
    try:
        logger.info("Defining Aliases")
        for k, v in cfg['aliases'].items():
            Alias(k, v)
    except Exception:
        logger.info("No Aliases Found")

    # Section: system
    try:
        logger.info("Getting System Parameters")
        specs['system'] = cfg['system']
    except KeyError:
        logger.info("No System Parameters Found")
        specs['system'] = {}

    # Section: variables
    try:
        logger.info("Defining Variables")
        for k, v in cfg['variables'].items():
            Variable(k, v)
    except Exception:
        logger.info("No Variables Found")

    # Section: functions
    try:
        logger.info("Getting Variable Functions")
        specs['functions'] = cfg['functions']
    except KeyError:
        logger.info("No Variable Functions Found")
        specs['functions'] = {}

    # Log the stock parameters
    logger.info('MARKET PARAMETERS:')
    logger.info('features = %s', specs['features'])
    logger.info('forecast_period = %d', specs['forecast_period'])
    logger.info('fractal = %s', specs['fractal'])
    logger.info('leaders = %s', specs['leaders'])
    logger.info('data_history = %d', specs['data_history'])
    logger.info('predict_history = %s', specs['predict_history'])
    logger.info('schema = %s', specs['schema'])
    logger.info('system = %s', specs['system'])
    logger.info('target_group = %s', specs['target_group'])

    # Market Specifications
    return specs
def save_model(model, tag, partition):
    r"""Persist the model's artifacts and, optionally, a submission file.

    Parameters
    ----------
    model : alphapy.Model
        The model object to save.
    tag : str
        A unique identifier for the output files, e.g., a date stamp.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    Notes
    -----
    The work is delegated in this order:

    * ``save_predictor`` for the model predictor
    * ``save_feature_map`` for the feature map
    * ``save_predictions`` for the predictions and probabilities

    When a submission file is configured, the sample file is read from
    the ``input`` directory, its second column is overwritten with the
    probabilities (classification with ``submit_probas`` set) or with
    the predictions, and the result is written to the ``output``
    directory as ``submission<USEP><yyyymmdd>``.

    """
    logger.info('=' * 80)

    # Extract model parameters.
    specs = model.specs
    directory = specs['directory']
    extension = specs['extension']
    model_type = specs['model_type']
    submission_file = specs['submission_file']
    submit_probas = specs['submit_probas']

    # Date stamp recording file creation
    timestamp = datetime.now().strftime("%Y%m%d")

    # Save the predictor, the feature map, and the predictions
    save_predictor(model, timestamp)
    save_feature_map(model, timestamp)
    preds, probas = save_predictions(model, tag, partition)

    # Nothing more to do without a submission file
    if not submission_file:
        return

    # Read the sample submission from the input directory
    sample_name = PSEP.join([submission_file, extension])
    sample_path = SSEP.join([SSEP.join([directory, 'input']), sample_name])
    ss = pd.read_csv(sample_path)

    # Overwrite the second column with the values to submit
    use_probas = submit_probas and model_type == ModelType.classification
    ss[ss.columns[1]] = probas if use_probas else preds

    # Write the submission to the output directory
    submission_name = PSEP.join([USEP.join(['submission', timestamp]),
                                 extension])
    submission_output = SSEP.join([SSEP.join([directory, 'output']),
                                   submission_name])
    logger.info("Saving Submission to %s", submission_output)
    ss.to_csv(submission_output, index=False)
def _enum_from_name(enum_cls, name, field):
    r"""Return the member of ``enum_cls`` called ``name``, or raise a
    ``ValueError`` naming the offending ``model.yml`` ``field``."""
    members = {x.name: x.value for x in enum_cls}
    if name in members:
        return enum_cls(members[name])
    raise ValueError("model.yml %s %s unrecognized" % (field, name))


def get_model_config():
    r"""Read in the configuration file for AlphaPy.

    Parameters
    ----------
    None : None

    Returns
    -------
    specs : dict
        The parameters for controlling AlphaPy.

    Raises
    ------
    ValueError
        Unrecognized value of a ``model.yml`` field.

    """
    logger.info("Model Configuration")

    # Read the configuration file
    full_path = SSEP.join([PSEP, 'config', 'model.yml'])
    with open(full_path, 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

    # Store configuration parameters in dictionary
    specs = {}

    # Section: project [this section must be first]
    specs['directory'] = cfg['project']['directory']
    specs['extension'] = cfg['project']['file_extension']
    specs['submission_file'] = cfg['project']['submission_file']
    specs['submit_probas'] = cfg['project']['submit_probas']

    # Section: data
    specs['drop'] = cfg['data']['drop']
    specs['features'] = cfg['data']['features']
    specs['sentinel'] = cfg['data']['sentinel']
    specs['separator'] = cfg['data']['separator']
    specs['shuffle'] = cfg['data']['shuffle']
    specs['split'] = cfg['data']['split']
    specs['target'] = cfg['data']['target']
    specs['target_value'] = cfg['data']['target_value']
    # sampling
    specs['sampling'] = cfg['data']['sampling']['option']
    specs['sampling_method'] = _enum_from_name(
        SamplingMethod, cfg['data']['sampling']['method'],
        'data:sampling:method')
    specs['sampling_ratio'] = cfg['data']['sampling']['ratio']

    # Section: features
    # clustering
    specs['clustering'] = cfg['features']['clustering']['option']
    specs['cluster_min'] = cfg['features']['clustering']['minimum']
    specs['cluster_max'] = cfg['features']['clustering']['maximum']
    specs['cluster_inc'] = cfg['features']['clustering']['increment']
    # counts
    specs['counts'] = cfg['features']['counts']['option']
    # encoding
    specs['rounding'] = cfg['features']['encoding']['rounding']
    specs['encoder'] = _enum_from_name(
        Encoders, cfg['features']['encoding']['type'],
        'features:encoding:type')
    # factors
    specs['factors'] = cfg['features']['factors']
    # interactions
    specs['interactions'] = cfg['features']['interactions']['option']
    specs['isample_pct'] = cfg['features']['interactions']['sampling_pct']
    specs['poly_degree'] = cfg['features']['interactions']['poly_degree']
    # isomap
    specs['isomap'] = cfg['features']['isomap']['option']
    specs['iso_components'] = cfg['features']['isomap']['components']
    specs['iso_neighbors'] = cfg['features']['isomap']['neighbors']
    # log transformation
    specs['logtransform'] = cfg['features']['logtransform']['option']
    # low-variance features
    specs['lv_remove'] = cfg['features']['variance']['option']
    specs['lv_threshold'] = cfg['features']['variance']['threshold']
    # NumPy
    specs['numpy'] = cfg['features']['numpy']['option']
    # pca
    specs['pca'] = cfg['features']['pca']['option']
    specs['pca_min'] = cfg['features']['pca']['minimum']
    specs['pca_max'] = cfg['features']['pca']['maximum']
    specs['pca_inc'] = cfg['features']['pca']['increment']
    specs['pca_whiten'] = cfg['features']['pca']['whiten']
    # Scaling
    specs['scaler_option'] = cfg['features']['scaling']['option']
    specs['scaler_type'] = _enum_from_name(
        Scalers, cfg['features']['scaling']['type'],
        'features:scaling:type')
    # SciPy
    specs['scipy'] = cfg['features']['scipy']['option']
    # text
    specs['ngrams_max'] = cfg['features']['text']['ngrams']
    specs['vectorize'] = cfg['features']['text']['vectorize']
    # t-sne
    specs['tsne'] = cfg['features']['tsne']['option']
    specs['tsne_components'] = cfg['features']['tsne']['components']
    specs['tsne_learn_rate'] = cfg['features']['tsne']['learning_rate']
    specs['tsne_perplexity'] = cfg['features']['tsne']['perplexity']

    # Section: model
    specs['algorithms'] = cfg['model']['algorithms']
    specs['cv_folds'] = cfg['model']['cv_folds']
    specs['model_type'] = _enum_from_name(
        ModelType, cfg['model']['type'], 'model:type')
    specs['n_estimators'] = cfg['model']['estimators']
    specs['pvalue_level'] = cfg['model']['pvalue_level']
    specs['scorer'] = cfg['model']['scoring_function']
    # calibration
    specs['calibration'] = cfg['model']['calibration']['option']
    specs['cal_type'] = cfg['model']['calibration']['type']
    # feature selection
    specs['feature_selection'] = cfg['model']['feature_selection']['option']
    specs['fs_percentage'] = cfg['model']['feature_selection']['percentage']
    specs['fs_uni_grid'] = cfg['model']['feature_selection']['uni_grid']
    score_func = cfg['model']['feature_selection']['score_func']
    if score_func in feature_scorers:
        specs['fs_score_func'] = feature_scorers[score_func]
    else:
        raise ValueError(
            "model.yml model:feature_selection:score_func %s unrecognized"
            % score_func)
    # grid search
    specs['grid_search'] = cfg['model']['grid_search']['option']
    specs['gs_iters'] = cfg['model']['grid_search']['iterations']
    specs['gs_random'] = cfg['model']['grid_search']['random']
    specs['gs_sample'] = cfg['model']['grid_search']['subsample']
    specs['gs_sample_pct'] = cfg['model']['grid_search']['sampling_pct']
    # rfe
    specs['rfe'] = cfg['model']['rfe']['option']
    specs['rfe_step'] = cfg['model']['rfe']['step']

    # Section: pipeline
    specs['n_jobs'] = cfg['pipeline']['number_jobs']
    specs['seed'] = cfg['pipeline']['seed']
    specs['verbosity'] = cfg['pipeline']['verbosity']

    # Section: plots
    specs['calibration_plot'] = cfg['plots']['calibration']
    specs['confusion_matrix'] = cfg['plots']['confusion_matrix']
    specs['importances'] = cfg['plots']['importances']
    specs['learning_curve'] = cfg['plots']['learning_curve']
    specs['roc_curve'] = cfg['plots']['roc_curve']

    # Section: treatments (optional)
    try:
        specs['treatments'] = cfg['treatments']
    except KeyError:
        specs['treatments'] = None
        logger.info("No Treatments Found")

    # Section: xgboost
    specs['esr'] = cfg['xgboost']['stopping_rounds']

    # Log the configuration parameters
    logger.info('MODEL PARAMETERS:')
    logger.info('algorithms = %s', specs['algorithms'])
    logger.info('calibration = %r', specs['calibration'])
    logger.info('cal_type = %s', specs['cal_type'])
    # Report the plot flag itself, not the calibration option.
    logger.info('calibration_plot = %r', specs['calibration_plot'])
    logger.info('clustering = %r', specs['clustering'])
    logger.info('cluster_inc = %d', specs['cluster_inc'])
    logger.info('cluster_max = %d', specs['cluster_max'])
    logger.info('cluster_min = %d', specs['cluster_min'])
    logger.info('confusion_matrix = %r', specs['confusion_matrix'])
    logger.info('counts = %r', specs['counts'])
    logger.info('cv_folds = %d', specs['cv_folds'])
    logger.info('directory = %s', specs['directory'])
    logger.info('extension = %s', specs['extension'])
    logger.info('drop = %s', specs['drop'])
    logger.info('encoder = %r', specs['encoder'])
    logger.info('esr = %d', specs['esr'])
    logger.info('factors = %s', specs['factors'])
    logger.info('features [X] = %s', specs['features'])
    logger.info('feature_selection = %r', specs['feature_selection'])
    logger.info('fs_percentage = %d', specs['fs_percentage'])
    logger.info('fs_score_func = %s', specs['fs_score_func'])
    logger.info('fs_uni_grid = %s', specs['fs_uni_grid'])
    logger.info('grid_search = %r', specs['grid_search'])
    logger.info('gs_iters = %d', specs['gs_iters'])
    logger.info('gs_random = %r', specs['gs_random'])
    logger.info('gs_sample = %r', specs['gs_sample'])
    logger.info('gs_sample_pct = %f', specs['gs_sample_pct'])
    logger.info('importances = %r', specs['importances'])
    logger.info('interactions = %r', specs['interactions'])
    logger.info('isomap = %r', specs['isomap'])
    logger.info('iso_components = %d', specs['iso_components'])
    logger.info('iso_neighbors = %d', specs['iso_neighbors'])
    logger.info('isample_pct = %d', specs['isample_pct'])
    logger.info('learning_curve = %r', specs['learning_curve'])
    logger.info('logtransform = %r', specs['logtransform'])
    logger.info('lv_remove = %r', specs['lv_remove'])
    logger.info('lv_threshold = %f', specs['lv_threshold'])
    logger.info('model_type = %r', specs['model_type'])
    logger.info('n_estimators = %d', specs['n_estimators'])
    logger.info('n_jobs = %d', specs['n_jobs'])
    logger.info('ngrams_max = %d', specs['ngrams_max'])
    logger.info('numpy = %r', specs['numpy'])
    logger.info('pca = %r', specs['pca'])
    logger.info('pca_inc = %d', specs['pca_inc'])
    logger.info('pca_max = %d', specs['pca_max'])
    logger.info('pca_min = %d', specs['pca_min'])
    logger.info('pca_whiten = %r', specs['pca_whiten'])
    logger.info('poly_degree = %d', specs['poly_degree'])
    logger.info('pvalue_level = %f', specs['pvalue_level'])
    logger.info('rfe = %r', specs['rfe'])
    logger.info('rfe_step = %d', specs['rfe_step'])
    logger.info('roc_curve = %r', specs['roc_curve'])
    logger.info('rounding = %d', specs['rounding'])
    logger.info('sampling = %r', specs['sampling'])
    logger.info('sampling_method = %r', specs['sampling_method'])
    logger.info('sampling_ratio = %f', specs['sampling_ratio'])
    logger.info('scaler_option = %r', specs['scaler_option'])
    logger.info('scaler_type = %r', specs['scaler_type'])
    logger.info('scipy = %r', specs['scipy'])
    logger.info('scorer = %s', specs['scorer'])
    logger.info('seed = %d', specs['seed'])
    logger.info('sentinel = %d', specs['sentinel'])
    logger.info('separator = %s', specs['separator'])
    logger.info('shuffle = %r', specs['shuffle'])
    logger.info('split = %f', specs['split'])
    logger.info('submission_file = %s', specs['submission_file'])
    logger.info('submit_probas = %r', specs['submit_probas'])
    logger.info('target [y] = %s', specs['target'])
    logger.info('target_value = %d', specs['target_value'])
    logger.info('treatments = %s', specs['treatments'])
    logger.info('tsne = %r', specs['tsne'])
    logger.info('tsne_components = %d', specs['tsne_components'])
    logger.info('tsne_learn_rate = %f', specs['tsne_learn_rate'])
    logger.info('tsne_perplexity = %f', specs['tsne_perplexity'])
    logger.info('vectorize = %r', specs['vectorize'])
    logger.info('verbosity = %d', specs['verbosity'])

    # Specifications to create the model
    return specs
def get_market_config():
    r"""Read the configuration file for MarketFlow.

    Besides filling the returned dictionary, this function sets the
    API key environment variable (when a key is configured) and creates
    the ``Space``, ``Group``, ``Alias``, and ``Variable`` objects
    described by the configuration file. All sections other than
    ``market`` are optional.

    Parameters
    ----------
    None : None

    Returns
    -------
    specs : dict
        The parameters for controlling MarketFlow.

    """
    logger.info("MarketFlow Configuration")

    # Read the configuration file
    full_path = SSEP.join([PSEP, 'config', 'market.yml'])
    with open(full_path, 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

    # Store configuration parameters in dictionary
    specs = {}

    # Section: market [this section must be first]
    specs['create_model'] = cfg['market']['create_model']
    # Both fractals are only checked for validity: a bad value is
    # reported but still stored as given.
    fractal = cfg['market']['data_fractal']
    try:
        pd.to_timedelta(fractal)
    except Exception:
        logger.info("data_fractal [%s] is an invalid pandas offset", fractal)
    specs['data_fractal'] = fractal
    specs['data_history'] = cfg['market']['data_history']
    specs['forecast_period'] = cfg['market']['forecast_period']
    fractal = cfg['market']['fractal']
    try:
        pd.to_timedelta(fractal)
    except Exception:
        logger.info("fractal [%s] is an invalid pandas offset", fractal)
    specs['fractal'] = fractal
    specs['lag_period'] = cfg['market']['lag_period']
    specs['leaders'] = cfg['market']['leaders']
    specs['predict_history'] = cfg['market']['predict_history']
    specs['schema'] = cfg['market']['schema']
    specs['subschema'] = cfg['market']['subschema']
    specs['api_key_name'] = cfg['market']['api_key_name']
    specs['api_key'] = cfg['market']['api_key']
    specs['subject'] = cfg['market']['subject']
    specs['target_group'] = cfg['market']['target_group']

    # Set API Key environment variable
    if specs['api_key']:
        os.environ[specs['api_key_name']] = specs['api_key']

    # Create the subject/schema/fractal namespace
    sspecs = [specs['subject'], specs['schema'], specs['fractal']]
    space = Space(*sspecs)

    # Section: features
    try:
        logger.info("Getting Features")
        specs['features'] = cfg['features']
    except KeyError:
        logger.info("No Features Found")
        specs['features'] = {}

    # Section: groups
    try:
        logger.info("Defining Groups")
        for g, m in list(cfg['groups'].items()):
            Group(g, space)
            Group.groups[g].add(m)
    except Exception:
        # Best effort: a missing or malformed section is only reported.
        logger.info("No Groups Found")

    # Section: aliases
    try:
        logger.info("Defining Aliases")
        for k, v in list(cfg['aliases'].items()):
            Alias(k, v)
    except Exception:
        logger.info("No Aliases Found")

    # Section: system
    try:
        logger.info("Getting System Parameters")
        specs['system'] = cfg['system']
    except KeyError:
        logger.info("No System Parameters Found")
        specs['system'] = {}

    # Section: variables
    logger.info("Defining AlphaPy Variables [phigh, plow]")
    Variable('phigh', 'probability >= 0.7')
    Variable('plow', 'probability <= 0.3')
    try:
        logger.info("Defining User Variables")
        for k, v in list(cfg['variables'].items()):
            Variable(k, v)
    except Exception:
        logger.info("No Variables Found")

    # Section: functions
    try:
        logger.info("Getting Variable Functions")
        specs['functions'] = cfg['functions']
    except KeyError:
        logger.info("No Variable Functions Found")
        specs['functions'] = {}

    # Log the stock parameters
    logger.info('MARKET PARAMETERS:')
    # NOTE(review): the API key is written to the log in clear text;
    # consider masking it if the logs are shared.
    logger.info('api_key = %s', specs['api_key'])
    logger.info('api_key_name = %s', specs['api_key_name'])
    logger.info('create_model = %r', specs['create_model'])
    logger.info('data_fractal = %s', specs['data_fractal'])
    logger.info('data_history = %d', specs['data_history'])
    logger.info('features = %s', specs['features'])
    logger.info('forecast_period = %d', specs['forecast_period'])
    logger.info('fractal = %s', specs['fractal'])
    logger.info('lag_period = %d', specs['lag_period'])
    logger.info('leaders = %s', specs['leaders'])
    logger.info('predict_history = %s', specs['predict_history'])
    logger.info('schema = %s', specs['schema'])
    logger.info('subject = %s', specs['subject'])
    logger.info('subschema = %s', specs['subschema'])
    logger.info('system = %s', specs['system'])
    logger.info('target_group = %s', specs['target_group'])

    # Market Specifications
    return specs
def save_predictions(model, tag, partition):
    r"""Save the predictions to disk.

    The most recent input file for the ``partition`` is read, optionally
    trimmed to the records on or after ``model.specs['predict_date']``,
    and three files are written to the ``output`` directory: the
    predictions, the probabilities (classification only), and the input
    frame ranked by probability or prediction.

    Parameters
    ----------
    model : alphapy.Model
        The model object to save.
    tag : str
        A unique identifier for the output files, e.g., a date stamp.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    preds : numpy array
        The prediction vector.
    probas : numpy array
        The probability vector, or ``None`` for non-classification models.

    """

    # Extract model parameters.
    directory = model.specs['directory']
    extension = model.specs['extension']
    model_type = model.specs['model_type']
    separator = model.specs['separator']
    is_classifier = model_type == ModelType.classification

    # Get date stamp to record file creation
    timestamp = get_datestamp()

    # Specify input and output directories
    input_dir = SSEP.join([directory, 'input'])
    output_dir = SSEP.join([directory, 'output'])

    # Read the prediction frame
    file_spec = ''.join([datasets[partition], '*'])
    file_name = most_recent_file(input_dir, file_spec)
    file_name = file_name.split(SSEP)[-1].split(PSEP)[0]
    pf = read_frame(input_dir, file_name, extension, separator)

    # Cull records before the prediction date. An explicit membership test
    # replaces the former bare ``except`` around the key lookup.
    found_pdate = 'predict_date' in model.specs
    if found_pdate:
        predict_date = model.specs['predict_date']
        pd_indices = pf[pf.date >= predict_date].index.tolist()
        # NOTE(review): the index labels are reused as positions here and in
        # np.take below; this presumably relies on read_frame returning a
        # default 0..n-1 index - confirm.
        # Copy so the column assignments below act on an independent frame.
        pf = pf.iloc[pd_indices].copy()
    else:
        pd_indices = pf.index.tolist()

    # Save predictions for all projects
    logger.info("Saving Predictions")
    output_file = USEP.join(['predictions', timestamp])
    preds = model.preds[(tag, partition)].squeeze()
    if found_pdate:
        preds = np.take(preds, pd_indices)
    pred_series = pd.Series(preds, index=pd_indices)
    df_pred = pd.DataFrame(pred_series, columns=['prediction'])
    write_frame(df_pred, output_dir, output_file, extension, separator)

    # Save probabilities for classification projects
    probas = None
    if is_classifier:
        logger.info("Saving Probabilities")
        output_file = USEP.join(['probabilities', timestamp])
        probas = model.probas[(tag, partition)].squeeze()
        if found_pdate:
            probas = np.take(probas, pd_indices)
        prob_series = pd.Series(probas, index=pd_indices)
        df_prob = pd.DataFrame(prob_series, columns=['probability'])
        write_frame(df_prob, output_dir, output_file, extension, separator)

    # Save ranked predictions
    logger.info("Saving Ranked Predictions")
    pf['prediction'] = pred_series
    if is_classifier:
        pf['probability'] = prob_series
        rank_column = 'probability'
    else:
        rank_column = 'prediction'
    pf = pf.sort_values(rank_column, ascending=False)
    output_file = USEP.join(['rankings', timestamp])
    write_frame(pf, output_dir, output_file, extension, separator)

    # Return predictions and any probabilities
    return preds, probas
def run_analysis(analysis, lag_period, forecast_period, leaders,
                 predict_history, splits=True):
    r"""Run an analysis for a given model and group.

    First, the data are loaded for each member of the analysis group.
    Then, the target value is lagged for the ``forecast_period``, and
    any ``leaders`` are lagged as well. Each frame is split along
    the ``predict_date`` from the ``analysis``, and finally the
    train and test files are generated.

    Parameters
    ----------
    analysis : alphapy.Analysis
        The analysis to run.
    lag_period : int
        The number of lagged features for the analysis.
    forecast_period : int
        The period for forecasting the target of the analysis.
    leaders : list
        The features that are contemporaneous with the target. The list
        is not modified; ``TAG_ID`` is added to a private copy.
    predict_history : int
        The number of periods required for lookback calculations.
    splits : bool, optional
        If ``True``, then the data for each member of the analysis
        group are in separate files.

    Returns
    -------
    analysis : alphapy.Analysis
        The completed analysis.

    """

    # Unpack analysis
    model = analysis.model
    group = analysis.group

    # Unpack model data
    predict_file = model.predict_file
    test_file = model.test_file
    train_file = model.train_file

    # Unpack model specifications
    directory = model.specs['directory']
    extension = model.specs['extension']
    predict_date = model.specs['predict_date']
    predict_mode = model.specs['predict_mode']
    separator = model.specs['separator']
    target = model.specs['target']
    train_date = model.specs['train_date']

    # Calculate split date
    logger.info("Analysis Dates")
    split_date = subtract_days(predict_date, predict_history)
    logger.info("Train Date: %s", train_date)
    logger.info("Split Date: %s", split_date)
    logger.info("Test Date: %s", predict_date)

    # Load the data frames
    data_frames = load_frames(group, directory, extension, separator, splits)

    # Collect the per-symbol subsets and concatenate once at the end;
    # DataFrame.append was removed in pandas 2.0 and copied the whole
    # accumulated frame on every call.
    predict_parts = []
    train_parts = []
    test_parts = []

    # Extend a copy: the caller's list must not accumulate TAG_ID entries
    # across repeated calls.
    leaders = list(leaders) + [TAG_ID]

    # Subset each individual frame and add to the master frame
    for df in data_frames:
        try:
            tag = df[TAG_ID].unique()[0]
        except (KeyError, IndexError):
            # no tag column, or an empty one
            tag = 'Unknown'
        first_date = df.index[0]
        last_date = df.index[-1]
        logger.info("Analyzing %s from %s to %s", tag, first_date, last_date)
        # sequence leaders, laggards, and target(s)
        df = sequence_frame(df, target, forecast_period, leaders, lag_period)
        # rows from the split date onward feed prediction or testing
        recent_rows = (df.index >= split_date) & (df.index <= last_date)
        if predict_mode:
            new_predict = df.loc[recent_rows]
            if len(new_predict) > 0:
                predict_parts.append(new_predict)
            else:
                logger.info(
                    "Prediction frame %s has zero rows. Check prediction date.",
                    tag)
            continue
        # split data into train and test
        new_train = df.loc[(df.index >= train_date) & (df.index < split_date)]
        if len(new_train) == 0:
            logger.info(
                "Training frame %s has zero rows. Check data source.", tag)
            continue
        train_parts.append(new_train.dropna())
        new_test = df.loc[recent_rows]
        if len(new_test) == 0:
            logger.info(
                "Testing frame %s has zero rows. Check prediction date.", tag)
            continue
        # check if target column has NaN values
        nan_count = df[target].isnull().sum()
        forecast_check = forecast_period - 1
        if nan_count != forecast_check:
            logger.info("%s has %d records with NaN targets", tag, nan_count)
        # drop records with NaN values in target column
        test_parts.append(new_test.dropna(subset=[target]))

    def _combine(parts):
        # pd.concat rejects an empty list, so fall back to an empty frame
        return pd.concat(parts) if parts else pd.DataFrame()

    # Write out the frames for input into the AlphaPy pipeline
    directory = SSEP.join([directory, 'input'])
    if predict_mode:
        # write out the predict frame
        write_frame(_combine(predict_parts), directory, predict_file,
                    extension, separator, index=True, index_label='date')
    else:
        # write out the train and test frames
        write_frame(_combine(train_parts), directory, train_file,
                    extension, separator, index=True, index_label='date')
        write_frame(_combine(test_parts), directory, test_file,
                    extension, separator, index=True, index_label='date')

    # Run the AlphaPy pipeline
    analysis.model = main_pipeline(model)

    # Return the analysis
    return analysis
def main(args=None):
    r"""MarketFlow Main Program

    Parameters
    ----------
    args : list of str, optional
        Command line arguments to parse. If ``None``, the arguments are
        taken from ``sys.argv`` by ``argparse``.

    Notes
    -----
    (1) Initialize logging.
    (2) Parse the command line arguments.
    (3) Get the market configuration.
    (4) Get the model configuration.
    (5) Create the model object.
    (6) Call the main MarketFlow pipeline.

    Raises
    ------
    ValueError
        Training date must be before prediction date.

    """

    # Suppress Warnings

    warnings.simplefilter(action='ignore', category=DeprecationWarning)
    warnings.simplefilter(action='ignore', category=FutureWarning)

    # Logging: everything goes to the log file, INFO and above to console

    log_format = "[%(asctime)s] %(levelname)s\t%(message)s"
    date_format = '%m/%d/%y %H:%M:%S'
    logging.basicConfig(format=log_format,
                        filename="market_flow.log", filemode='a',
                        level=logging.DEBUG, datefmt=date_format)
    console = logging.StreamHandler()
    console.setFormatter(logging.Formatter(log_format, datefmt=date_format))
    console.setLevel(logging.INFO)
    logging.getLogger().addHandler(console)

    # Start the pipeline

    logger.info('*' * 80)
    logger.info("MarketFlow Start")
    logger.info('*' * 80)

    # Argument Parsing

    parser = argparse.ArgumentParser(description="MarketFlow Parser")
    parser.add_argument('--pdate', dest='predict_date',
                        help="prediction date is in the format: YYYY-MM-DD",
                        required=False, type=valid_date)
    parser.add_argument('--tdate', dest='train_date',
                        help="training date is in the format: YYYY-MM-DD",
                        required=False, type=valid_date)
    # The two mode flags must be registered on the group itself; otherwise
    # argparse never enforces that only one of them is given.
    mode_group = parser.add_mutually_exclusive_group(required=False)
    mode_group.add_argument('--predict', dest='predict_mode',
                            action='store_true')
    mode_group.add_argument('--train', dest='predict_mode',
                            action='store_false')
    parser.set_defaults(predict_mode=False)
    # Honor the ``args`` parameter (None still means sys.argv).
    args = parser.parse_args(args)

    # Set train and predict dates

    if args.train_date:
        train_date = args.train_date
    else:
        # pd.datetime has been removed from pandas; use the stdlib instead
        train_date = datetime.date(1900, 1, 1).strftime("%Y-%m-%d")

    if args.predict_date:
        predict_date = args.predict_date
    else:
        predict_date = datetime.date.today().strftime("%Y-%m-%d")

    # Verify that the dates are in sequence.

    if train_date >= predict_date:
        raise ValueError("Training date must be before prediction date")
    logger.info("Training Date: %s", train_date)
    logger.info("Prediction Date: %s", predict_date)

    # Read stock configuration file
    market_specs = get_market_config()

    # Read model configuration file

    model_specs = get_model_config()
    model_specs['predict_mode'] = args.predict_mode
    model_specs['predict_date'] = predict_date
    model_specs['train_date'] = train_date

    # Create directories if necessary

    output_dirs = ['config', 'data', 'input', 'model', 'output', 'plots',
                   'systems']
    for od in output_dirs:
        output_dir = SSEP.join([model_specs['directory'], od])
        if not os.path.exists(output_dir):
            logger.info("Creating directory %s", output_dir)
            os.makedirs(output_dir)

    # Create a model object from the specifications
    model = Model(model_specs)

    # Start the pipeline
    model = market_pipeline(model, market_specs)

    # Complete the pipeline

    logger.info('*' * 80)
    logger.info("MarketFlow End")
    logger.info('*' * 80)
def training_pipeline(model):
    r"""AlphaPy Training Pipeline

    Parameters
    ----------
    model : alphapy.Model
        The model object for controlling the pipeline.

    Returns
    -------
    model : alphapy.Model
        The final results are stored in the model object.

    Raises
    ------
    IndexError
        If the number of columns of the train and test data do not match.
    KeyError
        If the configured scorer is not one of the known ``scorers``.

    """

    logger.info("Training Pipeline")

    # Unpack the model specifications

    calibration = model.specs['calibration']
    directory = model.specs['directory']
    drop = model.specs['drop']
    extension = model.specs['extension']
    feature_selection = model.specs['feature_selection']
    grid_search = model.specs['grid_search']
    model_type = model.specs['model_type']
    predict_mode = model.specs['predict_mode']
    rfe = model.specs['rfe']
    sampling = model.specs['sampling']
    scorer = model.specs['scorer']
    separator = model.specs['separator']
    target = model.specs['target']

    is_classifier = model_type == ModelType.classification

    # Get train and test data

    X_train, y_train = get_data(model, Partition.train)
    X_test, y_test = get_data(model, Partition.test)

    # Determine if there are any test labels (evaluated once, reused below)

    has_test_labels = y_test.any()
    if has_test_labels:
        logger.info("Test Labels Found")
        model.test_labels = True
    model = save_features(model, X_train, X_test, y_train, y_test)

    # Log feature statistics

    logger.info("Original Feature Statistics")
    logger.info("Number of Training Rows    : %d", X_train.shape[0])
    logger.info("Number of Training Columns : %d", X_train.shape[1])
    if is_classifier:
        uv, uc = np.unique(y_train, return_counts=True)
        logger.info("Unique Training Values for %s : %s", target, uv)
        logger.info("Unique Training Counts for %s : %s", target, uc)
    logger.info("Number of Testing Rows     : %d", X_test.shape[0])
    logger.info("Number of Testing Columns  : %d", X_test.shape[1])
    if is_classifier and model.test_labels:
        uv, uc = np.unique(y_test, return_counts=True)
        logger.info("Unique Testing Values for %s : %s", target, uv)
        logger.info("Unique Testing Counts for %s : %s", target, uc)

    # Merge training and test data

    if X_train.shape[1] != X_test.shape[1]:
        raise IndexError(
            "The number of training and test columns [%d, %d] must match."
            % (X_train.shape[1], X_test.shape[1]))
    split_point = X_train.shape[0]
    X = pd.concat([X_train, X_test])

    # Apply treatments to the feature matrix
    all_features = apply_treatments(model, X)

    # Drop features
    all_features = drop_features(all_features, drop)

    # Save the train and test files with extracted and dropped features

    datestamp = get_datestamp()
    data_dir = SSEP.join([directory, 'input'])
    df_train = all_features.iloc[:split_point, :]
    df_train = pd.concat(
        [df_train, pd.DataFrame(y_train, columns=[target])], axis=1)
    output_file = USEP.join([model.train_file, datestamp])
    write_frame(df_train, data_dir, output_file, extension, separator)
    df_test = all_features.iloc[split_point:, :]
    if has_test_labels:
        df_test = pd.concat(
            [df_test, pd.DataFrame(y_test, columns=[target])], axis=1)
    output_file = USEP.join([model.test_file, datestamp])
    write_frame(df_test, data_dir, output_file, extension, separator)

    # Create crosstabs for any categorical features

    if is_classifier:
        create_crosstabs(model)

    # Create initial features

    all_features = create_features(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Generate interactions

    all_features = create_interactions(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Remove low-variance features

    all_features = remove_lv_features(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Shuffle the data [if specified]
    model = shuffle_data(model)

    # Oversampling or Undersampling [if specified]

    if is_classifier:
        if sampling:
            model = sample_data(model)
        else:
            logger.info("Skipping Sampling")
        # Get sample weights (classification only)
        model = get_class_weights(model)

    # Perform feature selection, independent of algorithm

    if feature_selection:
        model = select_features(model)

    # Get the available classifiers and regressors

    logger.info("Getting All Estimators")
    estimators = get_estimators(model)

    # Get the available scorers

    if scorer not in scorers:
        raise KeyError("Scorer function %s not found" % scorer)

    # Model Selection

    logger.info("Selecting Models")

    for algo in model.algolist:
        logger.info("Algorithm: %s", algo)
        # select estimator
        try:
            estimator = estimators[algo]
        except KeyError:
            # Skip this algorithm. Falling through would either raise a
            # NameError or silently fit the previous iteration's estimator
            # under this algorithm's name.
            # NOTE(review): later stages read model.algolist; confirm they
            # tolerate an algorithm that produced no predictions.
            logger.info("Algorithm %s not found", algo)
            continue
        scoring = estimator.scoring
        est = estimator.estimator
        # initial fit
        model = first_fit(model, algo, est)
        # recursive feature elimination
        if rfe:
            if scoring:
                model = rfecv_search(model, algo)
            elif hasattr(est, "coef_"):
                model = rfe_search(model, algo)
            else:
                logger.info("No RFE Available for %s", algo)
        # grid search
        if grid_search:
            model = hyper_grid_search(model, estimator)
        # predictions
        model = make_predictions(model, algo, calibration)

    # Create a blended estimator

    if len(model.algolist) > 1:
        model = predict_blend(model)

    # Generate metrics

    model = generate_metrics(model, Partition.train)
    model = generate_metrics(model, Partition.test)

    # Store the best estimator
    model = predict_best(model)

    # Generate plots

    generate_plots(model, Partition.train)
    if model.test_labels:
        generate_plots(model, Partition.test)

    # Save best features and predictions
    save_model(model, 'BEST', Partition.test)

    # Return the model
    return model
def get_estimators(model):
    r"""Define all the AlphaPy estimators based on the contents
    of the ``algos.yml`` file.

    Parameters named in the global substitution table (for example
    ``n_jobs`` or ``random_state``) are overwritten with the matching
    value from the model specifications. Algorithms that are missing
    from ``estimator_map`` are logged and skipped.

    Parameters
    ----------
    model : alphapy.Model
        The model object containing global AlphaPy parameters.

    Returns
    -------
    estimators : dict
        All of the estimators required for running the pipeline.

    """

    # Extract model data

    directory = model.specs['directory']

    # Global values that may be substituted into algorithm parameters.
    # A plain lookup table replaces the former eval() of variable names.
    global_values = {
        'n_estimators': model.specs['n_estimators'],
        'n_jobs': model.specs['n_jobs'],
        'seed': model.specs['seed'],
        'verbosity': model.specs['verbosity'],
    }

    # Reference training data for Keras input_dim
    X_train = model.X_train

    # Initialize estimator dictionary
    estimators = {}

    # Global parameter substitution fields:
    # algorithm parameter name -> key of the global value that replaces it

    ps_fields = {'n_estimators' : 'n_estimators',
                 'iterations'   : 'n_estimators',
                 'n_jobs'       : 'n_jobs',
                 'nthread'      : 'n_jobs',
                 'thread_count' : 'n_jobs',
                 'seed'         : 'seed',
                 'random_state' : 'seed',
                 'random_seed'  : 'seed',
                 'verbosity'    : 'verbosity',
                 'verbose'      : 'verbosity'}

    # Get algorithm specifications

    config_dir = SSEP.join([directory, 'config'])
    algo_specs = get_algos_config(config_dir)

    # Create estimators for all of the algorithms

    for algo, spec in algo_specs.items():
        model_type = spec['model_type']
        params = spec['params']
        # Only existing keys are reassigned, so the dict may be updated
        # while it is being iterated.
        for param in params:
            if param in ps_fields:
                params[param] = global_values[ps_fields[param]]
        try:
            func = estimator_map[algo]
        except KeyError:
            logger.info("Algorithm %s not found (check package installation)",
                        algo)
            continue
        if 'KERAS' in algo:
            params['build_fn'] = create_keras_model
            layers = spec['layers']
            params['nlayers'] = len(layers)
            # Inject input_dim before the closing parenthesis of the first
            # layer only; replacing every ')' would corrupt layers that
            # contain nested calls.
            input_dim_string = ', input_dim={})'.format(X_train.shape[1])
            head, closing, tail = layers[0].rpartition(')')
            if closing:
                layers[0] = head + input_dim_string + tail
            for i, layer in enumerate(layers):
                params['layer' + str(i + 1)] = layer
            compiler = spec['compiler']
            params['optimizer'] = compiler['optimizer']
            params['loss'] = compiler['loss']
            # metrics are optional
            if 'metrics' in compiler:
                params['metrics'] = compiler['metrics']
        est = func(**params)
        grid = spec['grid']
        estimators[algo] = Estimator(algo, model_type, est, grid)

    # return the entire classifier list
    return estimators
def main(args=None):
    r"""AlphaPy Main Program

    Parameters
    ----------
    args : list of str, optional
        Command line arguments to parse. If ``None``, the arguments are
        taken from ``sys.argv`` by ``argparse``.

    Notes
    -----
    (1) Initialize logging.
    (2) Parse the command line arguments.
    (3) Get the model configuration.
    (4) Create the model object.
    (5) Call the main AlphaPy pipeline.

    """

    # Logging: everything goes to the log file, INFO and above to console

    log_format = "[%(asctime)s] %(levelname)s\t%(message)s"
    date_format = '%m/%d/%y %H:%M:%S'
    logging.basicConfig(format=log_format,
                        filename="alphapy.log", filemode='a',
                        level=logging.DEBUG, datefmt=date_format)
    console = logging.StreamHandler()
    console.setFormatter(logging.Formatter(log_format, datefmt=date_format))
    console.setLevel(logging.INFO)
    logging.getLogger().addHandler(console)

    # Start the pipeline

    logger.info('*' * 80)
    logger.info("AlphaPy Start")
    logger.info('*' * 80)

    # Argument Parsing

    parser = argparse.ArgumentParser(description="AlphaPy Parser")
    # The two mode flags must be registered on the group itself; otherwise
    # argparse never enforces that only one of them is given.
    mode_group = parser.add_mutually_exclusive_group(required=False)
    mode_group.add_argument('--predict', dest='predict_mode',
                            action='store_true')
    mode_group.add_argument('--train', dest='predict_mode',
                            action='store_false')
    parser.set_defaults(predict_mode=False)
    # Honor the ``args`` parameter (None still means sys.argv).
    args = parser.parse_args(args)

    # Read configuration file

    specs = get_model_config()
    specs['predict_mode'] = args.predict_mode

    # Create directories if necessary

    output_dirs = ['config', 'data', 'input', 'model', 'output', 'plots']
    for od in output_dirs:
        output_dir = SSEP.join([specs['directory'], od])
        if not os.path.exists(output_dir):
            logger.info("Creating directory %s", output_dir)
            os.makedirs(output_dir)

    # Create a model from the arguments

    logger.info("Creating Model")
    model = Model(specs)

    # Start the pipeline

    logger.info("Calling Pipeline")
    model = main_pipeline(model)

    # Complete the pipeline

    logger.info('*' * 80)
    logger.info("AlphaPy End")
    logger.info('*' * 80)
def write_plot(vizlib, plot, plot_type, tag, directory=None):
    r"""Save the plot to a file, or display it interactively.

    Parameters
    ----------
    vizlib : str
        The visualization library: ``'matplotlib'``, ``'seaborn'``,
        or ``'bokeh'``.
    plot : module
        Plotting context, e.g., ``plt``.
    plot_type : str
        Type of plot to generate.
    tag : str
        Unique identifier for the plot.
    directory : str, optional
        The full specification for the directory location. if
        ``directory`` is *None*, then the plot is displayed
        interactively.

    Returns
    -------
    None : None.

    Raises
    ------
    ValueError
        Unsupported (``'plotly'``) or unrecognized data visualization
        library.

    References
    ----------

    Visualization Libraries:

    * Matplotlib : http://matplotlib.org/
    * Seaborn    : https://seaborn.pydata.org/
    * Bokeh      : http://bokeh.pydata.org/en/latest/

    """

    # Validate visualization library

    if vizlib == 'plotly':
        raise ValueError("Unsupported data visualization library: %s" % vizlib)
    if vizlib not in ('matplotlib', 'seaborn', 'bokeh'):
        raise ValueError("Unrecognized data visualization library: %s" % vizlib)

    use_bokeh = vizlib == 'bokeh'

    # Display the plot interactively

    if not directory:
        if use_bokeh:
            show(plot)
        else:
            # show() renders the figure; a bare plot() call draws nothing
            # and never opens a window.
            plot.show()
        return

    # Save the plot: bokeh writes HTML, the other libraries write PNG

    suffix = '.html' if use_bokeh else '.png'
    file_only = ''.join([plot_type, USEP, tag, suffix])
    file_all = SSEP.join([directory, file_only])
    logger.info("Writing plot to %s", file_all)
    if use_bokeh:
        output_file(file_all, title=tag)
        show(plot)
    else:
        plot.savefig(file_all)
def run_system(model, system, group, quantity=1):
    r"""Run a system for a given group, creating a trades frame.

    Parameters
    ----------
    model : alphapy.Model
        The model object with specifications.
    system : alphapy.System or str
        The system to run, either a long/short system or a local one
        identified by function name, e.g., 'open_range_breakout'.
    group : alphapy.Group
        The group of symbols to test.
    quantity : float
        The amount to trade for each symbol, e.g., number of shares

    Returns
    -------
    tf : pandas.DataFrame
        All of the trades for this ``group``, or ``None`` if no trades
        were generated.

    """

    # A string names a local system function; anything else is treated
    # as a long/short system object.
    is_local = isinstance(system, str)
    system_name = system if is_local else system.name

    logger.info("Generating Trades for System %s", system_name)

    # Unpack the model data.

    directory = model.specs['directory']
    extension = model.specs['extension']
    separator = model.specs['separator']

    # Extract the group information.

    gname = group.name
    gmembers = group.members
    gspace = group.space

    # Run the system for each member of the group

    gtlist = []
    for symbol in gmembers:
        # generate the trades for this member
        if is_local:
            try:
                tlist = globals()[system_name](symbol, gspace, quantity)
            except Exception:
                logger.info("Could not execute system for %s", symbol)
                # Reset explicitly: otherwise the name is undefined on the
                # first symbol, or still holds the previous symbol's trades.
                tlist = None
        else:
            # call default long/short system
            tlist = long_short(system, symbol, gspace, quantity)
        if tlist:
            # add trades to global trade list
            gtlist.extend(tlist)
        else:
            logger.info("No trades for symbol %s", symbol)

    # Create group trades frame

    if not gtlist:
        logger.info("No trades were found")
        return None

    tspace = Space(system_name, "trades", group.space.fractal)
    gtlist = sorted(gtlist, key=lambda x: x[0])
    # Each item is a (key, row) pair. DataFrame.from_items was removed in
    # pandas 1.0; the constructor builds the same frame and, unlike a dict,
    # keeps several trades that share one key.
    tf = DataFrame([row for _, row in gtlist],
                   index=[key for key, _ in gtlist],
                   columns=Trade.states)
    tfname = frame_name(gname, tspace)
    system_dir = SSEP.join([directory, 'systems'])
    write_frame(tf, system_dir, tfname, extension, separator, index=True)

    # Return trades frame
    return tf
def trade_system(model, system, space, intraday, name, quantity):
    r"""Trade the given system.

    The entry and exit expressions of the ``system`` are evaluated on the
    price frame of ``name``, and the frame is then walked row by row to
    build a list of entry and exit orders. If any active signal contains
    ``'phigh'`` or ``'plow'``, the most recent probabilities file in the
    model's ``output`` directory is joined to the price frame first.

    Parameters
    ----------
    model : alphapy.Model
        The model object with specifications.
    system : alphapy.System
        The long/short system to run.
    space : alphapy.Space
        Namespace of instrument prices.
    intraday : bool
        If True, then run an intraday system.
    name : str
        The symbol to trade.
    quantity : float
        The amount of the ``name`` to trade, e.g., number of shares

    Returns
    -------
    tradelist : list
        List of trade entries and exits. Each item is a tuple of the row
        index and ``[name, order, quantity, close]``.

    Other Parameters
    ----------------
    Frame.frames : dict
        All of the data frames containing price data.

    """

    # Unpack the model data.

    directory = model.specs['directory']
    extension = model.specs['extension']
    separator = model.specs['separator']

    # Unpack the system parameters.

    longentry = system.longentry
    shortentry = system.shortentry
    longexit = system.longexit
    shortexit = system.shortexit
    holdperiod = system.holdperiod
    scale = system.scale

    # Determine whether or not this is a model-driven system, i.e., one
    # whose signals mention the probability variables phigh or plow.

    entries_and_exits = [longentry, shortentry, longexit, shortexit]
    active_signals = [x for x in entries_and_exits if x is not None]
    use_model = False
    for signal in active_signals:
        if any(x in signal for x in ['phigh', 'plow']):
            use_model = True

    # Read in the price frame
    pf = Frame.frames[frame_name(name, space)].df

    # Use model output probabilities as input to the system

    if use_model:
        # get latest probabilities file
        probs_dir = SSEP.join([directory, 'output'])
        file_path = most_recent_file(probs_dir, 'probabilities*')
        # strip the directory and everything after the first '.'
        file_name = file_path.split(SSEP)[-1].split('.')[0]
        # read the probabilities frame and trim the price frame to its
        # last rows, so both frames have the same length
        probs_frame = read_frame(probs_dir, file_name, extension, separator)
        pf = pf[-probs_frame.shape[0]:]
        # rows are matched by position: the probabilities take over the
        # index of the trimmed price frame
        probs_frame.index = pf.index
        probs_frame.columns = ['probability']
        # add probability column to price frame
        pf = pd.concat([pf, probs_frame], axis=1)

    # Evaluate the long and short events in the price frame
    # NOTE(review): the loop below reads each signal as a column of ``pf``,
    # so vexec presumably adds those columns - confirm.

    for signal in active_signals:
        vexec(pf, signal)

    # Initialize trading state variables:
    #   inlong / inshort : whether a long / short position is open
    #   h                : hold counter, incremented once per row in a position
    #   p                : signed position (raised by longs, lowered by shorts)
    #   q                : quantity traded per entry

    inlong = False
    inshort = False
    h = 0
    p = 0
    q = quantity
    tradelist = []

    # Loop through prices and generate trades

    for dt, row in pf.iterrows():
        # get closing price
        c = row['close']
        if intraday:
            bar_number = row['bar_number']
            end_of_day = row['end_of_day']
        # evaluate entry and exit conditions (None when not configured)
        lerow = row[longentry] if longentry else None
        serow = row[shortentry] if shortentry else None
        lxrow = row[longexit] if longexit else None
        sxrow = row[shortexit] if shortexit else None
        # process the long and short events; a long entry takes precedence
        # over a short entry on the same row
        if lerow:
            if p < 0:
                # short active, so exit short
                tradelist.append((dt, [name, Orders.sx, -p, c]))
                inshort = False
                h = 0
                p = 0
            if p == 0 or scale:
                # go long (again); with ``scale`` set, an open long is
                # added to
                tradelist.append((dt, [name, Orders.le, q, c]))
                inlong = True
                p = p + q
        elif serow:
            if p > 0:
                # long active, so exit long
                tradelist.append((dt, [name, Orders.lx, -p, c]))
                inlong = False
                h = 0
                p = 0
            if p == 0 or scale:
                # go short (again); with ``scale`` set, an open short is
                # added to
                tradelist.append((dt, [name, Orders.se, -q, c]))
                inshort = True
                p = p - q
        # check exit conditions; ``h > 0`` keeps a position from being
        # closed on the row where it was opened
        if inlong and h > 0 and lxrow:
            # long active, so exit long
            tradelist.append((dt, [name, Orders.lx, -p, c]))
            inlong = False
            h = 0
            p = 0
        if inshort and h > 0 and sxrow:
            # short active, so exit short
            tradelist.append((dt, [name, Orders.sx, -p, c]))
            inshort = False
            h = 0
            p = 0
        # if a holding period was given, then check for exit
        if holdperiod and h >= holdperiod:
            if inlong:
                tradelist.append((dt, [name, Orders.lh, -p, c]))
                inlong = False
            if inshort:
                tradelist.append((dt, [name, Orders.sh, -p, c]))
                inshort = False
            h = 0
            p = 0
        # increment the hold counter
        if inlong or inshort:
            h += 1
            # intraday positions are closed on the end-of-day row
            if intraday and end_of_day:
                if inlong:
                    # long active, so exit long
                    tradelist.append((dt, [name, Orders.lx, -p, c]))
                    inlong = False
                if inshort:
                    # short active, so exit short
                    tradelist.append((dt, [name, Orders.sx, -p, c]))
                    inshort = False
                h = 0
                p = 0
    return tradelist
def run_system(model, system, group, intraday=False, quantity=1):
    r"""Run a system for a given group, creating a trades frame.

    Parameters
    ----------
    model : alphapy.Model
        The model object with specifications.
    system : alphapy.System
        The system to run.
    group : alphapy.Group
        The group of symbols to trade.
    intraday : bool, optional
        If true, this is an intraday system.
    quantity : float, optional
        The amount to trade for each symbol, e.g., number of shares

    Returns
    -------
    tf : pandas.DataFrame
        All of the trades for this ``group``, or ``None`` if no trades
        were generated.

    """

    system_name = system.name
    logger.info("Generating Trades for System %s", system_name)

    # Unpack the model data.

    directory = model.specs['directory']
    extension = model.specs['extension']
    separator = model.specs['separator']

    # Extract the group information.

    gname = group.name
    gmembers = group.members
    gspace = group.space

    # Run the system for each member of the group

    gtlist = []
    for symbol in gmembers:
        # generate the trades for this member
        tlist = trade_system(model, system, gspace, intraday, symbol, quantity)
        if tlist:
            # add trades to global trade list
            gtlist.extend(tlist)
        else:
            logger.info("No trades for symbol %s", symbol)

    # Create group trades frame

    if not gtlist:
        logger.info("No trades were found")
        return None

    tspace = Space(system_name, "trades", group.space.fractal)
    gtlist = sorted(gtlist, key=lambda x: x[0])
    # Each item is a (key, row) pair. DataFrame.from_items was removed in
    # pandas 1.0; the constructor builds the same frame and, unlike a dict,
    # keeps several trades that share one key.
    tf = DataFrame([row for _, row in gtlist],
                   index=[key for key, _ in gtlist],
                   columns=Trade.states)
    tfname = frame_name(gname, tspace)
    system_dir = SSEP.join([directory, 'systems'])
    # index label(s) written to the header of the trades file
    labels = ['date']
    if intraday:
        labels.append('time')
    write_frame(tf, system_dir, tfname, extension, separator,
                index=True, index_label=labels)

    # Return trades frame
    return tf