Example #1
def load_predictor(directory):
    r"""Load the model predictor from storage. By default, the
    most recent model is loaded into memory.

    Parameters
    ----------
    directory : str
        Full directory specification of the predictor's location.

    Returns
    -------
    predictor : function
        The scoring function.

    """

    # Locate the model Pickle file

    try:
        search_dir = SSEP.join([directory, 'model'])
        file_name = most_recent_file(search_dir, 'model_*.pkl')
        logger.info("Loading model predictor from %s", file_name)
        # load the model predictor
        predictor = joblib.load(file_name)
    except Exception:
        logger.error("Could not find model predictor in %s", search_dir)
        predictor = None

    # Return the model predictor
    return predictor
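
A minimal round-trip sketch of the joblib persistence these examples rely on; the estimator and the file name below are illustrative and only mirror the 'model_<timestamp>.pkl' pattern used above.

import joblib
from sklearn.linear_model import LogisticRegression

# Fit a tiny placeholder estimator, dump it to disk, and load it back.
clf = LogisticRegression().fit([[0.0], [1.0]], [0, 1])
joblib.dump(clf, 'model_20200101.pkl')
predictor = joblib.load('model_20200101.pkl')
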
Example #2
def save_predictor(model, timestamp):
    r"""Save the time-stamped model predictor to disk.

    Parameters
    ----------
    model : alphapy.Model
        The model object that contains the best estimator.
    timestamp : str
        Date stamp in yyyymmdd format, appended to the file name.

    Returns
    -------
    None : None

    """

    logger.info("Saving Model Predictor")

    # Extract model parameters.
    directory = model.specs['directory']

    # Get the best predictor
    predictor = model.estimators['BEST']

    # Create full path name.

    filename = 'model_' + timestamp + '.pkl'
    full_path = SSEP.join([directory, 'model', filename])

    # Save model object

    logger.info("Writing model predictor to %s", full_path)
    joblib.dump(predictor, full_path)
Example #3
def write_frame(df, directory, filename, extension, separator,
                index=False, index_label=None):
    r"""Write a dataframe into a delimiter-separated file.

    Parameters
    ----------
    df : pandas.DataFrame
        The pandas dataframe to save to a file.
    directory : str
        Full directory specification.
    filename : str
        Name of the file to write, excluding the ``extension``.
    extension : str
        File name extension, e.g., ``csv``.
    separator : str
        The delimiter between fields in the file.
    index : bool, optional
        If ``True``, write the row names (index).
    index_label : str, optional
        A column label for the ``index``.

    Returns
    -------
    None : None

    """
    file_only = PSEP.join([filename, extension])
    file_all = SSEP.join([directory, file_only])
    logger.info("Writing data frame to %s", file_all)
    try:
        df.to_csv(file_all, sep=separator, index=index, index_label=index_label)
    except Exception:
        logger.error("Could not write data frame to %s", file_all)
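
A self-contained sketch of the path construction write_frame performs, assuming PSEP is the period separator and SSEP the slash separator (their actual values come from AlphaPy's globals and are not shown here).

import pandas as pd

# Build './sample.csv' the same way: filename + extension, then directory + file.
PSEP, SSEP = '.', '/'
df = pd.DataFrame({'x': [1, 2, 3]})
file_only = PSEP.join(['sample', 'csv'])      # 'sample.csv'
file_all = SSEP.join(['.', file_only])        # './sample.csv'
df.to_csv(file_all, sep=',', index=False)
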
Example #4
def load_predictor(directory):
    r"""Load the model predictor from storage. By default, the
    most recent model is loaded into memory.

    Parameters
    ----------
    directory : str
        Full directory specification of the predictor's location.

    Returns
    -------
    predictor : function
        The scoring function.

    """

    # Create search path
    search_path = SSEP.join([directory, 'model', 'model_*.pkl'])

    # Locate the model Pickle file

    try:
        # find the latest file
        filename = max(glob.iglob(search_path), key=os.path.getctime)
        logger.info("Loading model predictor from %s", filename)
        # load the model predictor
        predictor = joblib.load(filename)
    except Exception:
        logger.error("Could not find model predictor in %s", search_path)
        predictor = None

    # Return the model predictor
    return predictor
Example #5
def load_feature_map(model, directory):
    r"""Load the feature map from storage. By default, the
    most recent feature map is loaded into memory.

    Parameters
    ----------
    model : alphapy.Model
        The model object to contain the feature map.
    directory : str
        Full directory specification of the feature map's location.

    Returns
    -------
    model : alphapy.Model
        The model object containing the feature map.

    """

    # Locate the feature map and load it

    try:
        search_dir = SSEP.join([directory, 'model'])
        file_name = most_recent_file(search_dir, 'feature_map_*.pkl')
        logger.info("Loading feature map from %s", file_name)
        # load the feature map
        feature_map = joblib.load(file_name)
        model.feature_map = feature_map
    except Exception:
        logger.error("Could not find feature map in %s", search_dir)

    # Return the model with the feature map
    return model
Example #6
def np_store_data(data, dir_name, file_name, extension, separator):
    r"""Store NumPy data in a file.

    Parameters
    ----------
    data : numpy array
        The data array to store.
    dir_name : str
        Full directory specification.
    file_name : str
        Name of the file to read, excluding the ``extension``.
    extension : str
        File name extension, e.g., ``csv``.
    separator : str
        The delimiter between fields in the file.

    Returns
    -------
    None : None

    """
    output_file = PSEP.join([file_name, extension])
    output = SSEP.join([dir_name, output_file])
    logger.info("Storing output to %s", output)
    np.savetxt(output, data, delimiter=separator)
Example #7
def save_feature_map(model, timestamp):
    r"""Save the feature map to disk.

    Parameters
    ----------
    model : alphapy.Model
        The model object containing the feature map.
    timestamp : str
        Date stamp in yyyymmdd format, appended to the file name.

    Returns
    -------
    None : None

    """

    logger.info("Saving Feature Map")

    # Extract model parameters.
    directory = model.specs['directory']

    # Create full path name.

    filename = 'feature_map_' + timestamp + '.pkl'
    full_path = SSEP.join([directory, 'model', filename])

    # Save model object

    logger.info("Writing feature map to %s", full_path)
    joblib.dump(model.feature_map, full_path)
Example #8
def get_estimators(model):
    r"""Define all the AlphaPy estimators based on the contents
    of the ``algos.yml`` file.

    Parameters
    ----------
    model : alphapy.Model
        The model object containing global AlphaPy parameters.

    Returns
    -------
    estimators : dict
        All of the estimators required for running the pipeline.

    """

    # Extract model data

    directory = model.specs['directory']
    n_estimators = model.specs['n_estimators']
    n_jobs = model.specs['n_jobs']
    seed = model.specs['seed']
    verbosity = model.specs['verbosity']

    # Initialize estimator dictionary
    estimators = {}

    # Global parameter substitution fields
    ps_fields = {
        'n_estimators': 'n_estimators',
        'n_jobs': 'n_jobs',
        'nthread': 'n_jobs',
        'random_state': 'seed',
        'seed': 'seed',
        'verbose': 'verbosity'
    }

    # Get algorithm specifications

    config_dir = SSEP.join([directory, 'config'])
    algo_specs = get_algos_config(config_dir)

    # Create estimators for all of the algorithms

    for algo in algo_specs:
        model_type = algo_specs[algo]['model_type']
        params = algo_specs[algo]['params']
        for param in params:
            if param in ps_fields and isinstance(param, str):
                algo_specs[algo]['params'][param] = eval(ps_fields[param])
        func = estimator_map[algo]
        est = func(**params)
        grid = algo_specs[algo]['grid']
        scoring = algo_specs[algo]['scoring']
        estimators[algo] = Estimator(algo, model_type, est, grid, scoring)

    # Return the dictionary of estimators
    return estimators
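
The substitution step above maps YAML parameter names to the global pipeline values via eval. The sketch below shows the same idea with a plain lookup dict; all values are placeholders standing in for model.specs entries.

# Placeholder globals standing in for model.specs values.
global_values = {'n_estimators': 201, 'n_jobs': -1, 'seed': 42, 'verbosity': 0}
ps_fields = {'n_estimators': 'n_estimators', 'nthread': 'n_jobs',
             'random_state': 'seed', 'verbose': 'verbosity'}
params = {'random_state': 'seed', 'max_depth': 8}
for param in list(params):
    if param in ps_fields:
        # substitute the named global value for the parameter
        params[param] = global_values[ps_fields[param]]
# params is now {'random_state': 42, 'max_depth': 8}
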
Example #9
def get_algos_config(cfg_dir):
    r"""Read the algorithms configuration file.

    Parameters
    ----------
    cfg_dir : str
        The directory where the configuration file ``algos.yml``
        is stored.

    Returns
    -------
    specs : dict
        The specifications for determining which algorithms to run.

    """

    logger.info("Algorithm Configuration")

    # Read the configuration file

    full_path = SSEP.join([cfg_dir, 'algos.yml'])
    with open(full_path, 'r') as ymlfile:
        specs = yaml.load(ymlfile, Loader=yaml.FullLoader)

    # Find optional packages

    find_optional_packages()

    # Ensure each algorithm has required keys

    minimum_keys = ['model_type', 'params', 'grid']
    required_keys_keras = minimum_keys + ['layers', 'compiler']
    for algo in specs:
        if 'KERAS' in algo:
            required_keys = required_keys_keras
        else:
            required_keys = minimum_keys
        algo_keys = list(specs[algo].keys())
        if set(algo_keys) != set(required_keys):
            logger.warning("Algorithm %s does not have the required keys %s",
                           algo, required_keys)
            logger.warning("Keys found instead: %s", algo_keys)
        else:
            # determine whether or not model type is valid
            model_types = {x.name: x.value for x in ModelType}
            model_type = specs[algo]['model_type']
            if model_type in model_types:
                specs[algo]['model_type'] = ModelType(model_types[model_type])
            else:
                raise ValueError("algos.yml model:type %s unrecognized" % model_type)

    # Algorithm Specifications
    return specs
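
For reference, a hypothetical entry shaped like what yaml.load would return for one algorithm, consistent with the minimum keys checked above (model_type, params, grid); the algorithm name and values are illustrative only.

specs = {
    'RF': {
        'model_type': 'classification',
        'params': {'n_estimators': 'n_estimators', 'random_state': 'seed'},
        'grid': {'max_depth': [4, 6, 8], 'n_estimators': [50, 100, 200]},
    }
}
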
Example #10
def get_plot_directory(model):
    r"""Get the plot output directory of a model.

    Parameters
    ----------
    model : alphapy.Model
        The model object with directory information.

    Returns
    -------
    plot_directory : str
        The output directory to write the plot.

    """
    directory = model.specs['directory']
    plot_directory = SSEP.join([directory, 'plots'])
    return plot_directory
Example #11
def get_sport_config():
    r"""Read the configuration file for SportFlow.

    Parameters
    ----------
    None : None

    Returns
    -------
    specs : dict
        The parameters for controlling SportFlow.

    """

    # Read the configuration file

    full_path = SSEP.join(['.', 'config', 'sport.yml'])
    with open(full_path, 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

    # Store configuration parameters in dictionary

    specs = {}

    # Section: sport

    specs['league'] = cfg['sport']['league']
    specs['points_max'] = cfg['sport']['points_max']
    specs['points_min'] = cfg['sport']['points_min']
    specs['random_scoring'] = cfg['sport']['random_scoring']
    specs['rolling_window'] = cfg['sport']['rolling_window']   
    specs['seasons'] = cfg['sport']['seasons']

    # Log the sports parameters

    logger.info('SPORT PARAMETERS:')
    logger.info('league           = %s', specs['league'])
    logger.info('points_max       = %d', specs['points_max'])
    logger.info('points_min       = %d', specs['points_min'])
    logger.info('random_scoring   = %r', specs['random_scoring'])
    logger.info('rolling_window   = %d', specs['rolling_window'])
    logger.info('seasons          = %s', specs['seasons'])

    # Game Specifications
    return specs
Example #12
File: frame.py  Project: xr3i444/AlphaPy
def read_frame(directory,
               filename,
               extension,
               separator,
               index_col=None,
               squeeze=False):
    r"""Read a delimiter-separated file into a data frame.

    Parameters
    ----------
    directory : str
        Full directory specification.
    filename : str
        Name of the file to read, excluding the ``extension``.
    extension : str
        File name extension, e.g., ``csv``.
    separator : str
        The delimiter between fields in the file.
    index_col : str, optional
        Column to use as the row labels in the dataframe.
    squeeze : bool, optional
        If the data contains only one column, then return a pandas Series.

    Returns
    -------
    df : pandas.DataFrame
        The pandas dataframe loaded from the file location. If the file
        cannot be located, then an empty dataframe is returned.

    """
    file_only = PSEP.join([filename, extension])
    file_all = SSEP.join([directory, file_only])
    logger.info("Loading data from %s", file_all)
    try:
        df = pd.read_csv(file_all,
                         sep=separator,
                         index_col=index_col,
                         squeeze=squeeze,
                         low_memory=False)
    except:
        df = pd.DataFrame()
        logger.info("Could not find or access %s", file_all)
    return df
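
A companion sketch of the read path, again assuming PSEP is '.' and SSEP is '/'; like the function above, it falls back to an empty dataframe when the file cannot be read.

import pandas as pd

PSEP, SSEP = '.', '/'
file_all = SSEP.join(['.', PSEP.join(['sample', 'csv'])])
try:
    df = pd.read_csv(file_all, sep=',', low_memory=False)
except OSError:
    # missing or unreadable file: fall back to an empty frame instead of raising
    df = pd.DataFrame()
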
Example #13
File: data.py  Project: ywuywu/ml_monorepo
def get_pandas_data(schema, symbol, lookback_period):
    r"""Get Pandas Web Reader data.

    Parameters
    ----------
    schema : str
        The source of the pandas-datareader data.
    symbol : str
        A valid stock symbol.
    lookback_period : int
        The number of days of daily data to retrieve.

    Returns
    -------
    df : pandas.DataFrame
        The dataframe containing the daily data.

    """

    # Quandl is a special case with subfeeds.

    if 'quandl' in schema:
        try:
            schema, symbol_prefix = schema.split(USEP)
            symbol = SSEP.join([symbol_prefix, symbol])
        except:
            logger.info("Quandl schema format must be: quandl_DB. Ex: quandl_wiki")

    # Calculate the start and end date.

    start = datetime.now() - timedelta(lookback_period)
    end = datetime.now()

    # Call the Pandas Web data reader.

    df = None
    try:
        df = web.DataReader(symbol.upper(), schema, start, end)
    except:
        logger.info("Could not retrieve data for: %s", symbol)

    return df
Example #14
File: data.py  Project: xr3i444/AlphaPy
def get_quandl_data(schema, subschema, symbol, intraday_data, data_fractal,
                    from_date, to_date, lookback_period):
    r"""Get Quandl data.

    Parameters
    ----------
    schema : str
        The schema for this data feed.
    subschema : str
        Any subschema for this data feed.
    symbol : str
        A valid stock symbol.
    intraday_data : bool
        If True, then get intraday data.
    data_fractal : str
        Pandas offset alias.
    from_date : str
        Starting date for symbol retrieval.
    to_date : str
        Ending date for symbol retrieval.
    lookback_period : int
        The number of periods of data to retrieve.

    Returns
    -------
    df : pandas.DataFrame
        The dataframe containing the market data.

    """

    # Quandl is a special case with subfeeds.

    symbol = SSEP.join([subschema.upper(), symbol.upper()])

    # Call the Pandas Web data reader.

    df = get_pandas_data(schema, subschema, symbol, intraday_data,
                         data_fractal, from_date, to_date, lookback_period)

    return df
Example #15
def most_recent_file(directory, file_spec):
    r"""Find the most recent file in a directory.

    Parameters
    ----------
    directory : str
        Full directory specification.
    file_spec : str
        Wildcard search string for the file to locate.

    Returns
    -------
    file_name : str
        Full path name of the most recent file matching ``file_spec``.

    """
    # Create search path
    search_path = SSEP.join([directory, file_spec])
    # find the latest file
    file_name = max(glob.iglob(search_path), key=os.path.getctime)
    # return the name of the most recent file
    return file_name
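
A self-contained sketch of the same "newest match" technique; note that max() raises ValueError when the wildcard matches nothing, which is why callers of the helper above wrap it in try/except.

import glob
import os

search_path = os.path.join('model', 'model_*.pkl')
matches = glob.glob(search_path)
# pick the newest file by creation time, or None when there is no match
latest = max(matches, key=os.path.getctime) if matches else None
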
Example #16
def load_predictor(directory):
    r"""Load the model predictor from storage. By default, the
    most recent model is loaded into memory.

    Parameters
    ----------
    directory : str
        Full directory specification of the predictor's location.

    Returns
    -------
    predictor : function
        The scoring function.

    """

    # Locate the model Pickle or HD5 file

    search_dir = SSEP.join([directory, 'model'])
    file_name = most_recent_file(search_dir, 'model_*.*')

    # Load the model from the file

    file_ext = file_name.split(PSEP)[-1]
    if file_ext == 'pkl' or file_ext == 'h5':
        logger.info("Loading model predictor from %s", file_name)
        # load the model predictor
        if file_ext == 'pkl':
            predictor = joblib.load(file_name)
        elif file_ext == 'h5':
            predictor = load_model(file_name)
    else:
        logger.error("Could not find model predictor in %s", search_dir)
        predictor = None

    # Return the model predictor
    return predictor
Example #17
def get_market_config():
    r"""Read the configuration file for MarketFlow.

    Parameters
    ----------
    None : None

    Returns
    -------
    specs : dict
        The parameters for controlling MarketFlow.

    """

    logger.info("MarketFlow Configuration")

    # Read the configuration file

    full_path = SSEP.join([PSEP, 'config', 'market.yml'])
    with open(full_path, 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

    # Store configuration parameters in dictionary

    specs = {}

    # Section: market [this section must be first]

    specs['forecast_period'] = cfg['market']['forecast_period']
    specs['fractal'] = cfg['market']['fractal']
    specs['leaders'] = cfg['market']['leaders']
    specs['data_history'] = cfg['market']['data_history']
    specs['predict_history'] = cfg['market']['predict_history']
    specs['schema'] = cfg['market']['schema']
    specs['target_group'] = cfg['market']['target_group']

    # Create the subject/schema/fractal namespace

    sspecs = ['stock', specs['schema'], specs['fractal']]
    space = Space(*sspecs)

    # Section: features

    try:
        logger.info("Getting Features")
        specs['features'] = cfg['features']
    except:
        logger.info("No Features Found")
        specs['features'] = {}

    # Section: groups

    try:
        logger.info("Defining Groups")
        for g, m in cfg['groups'].items():
            Group(g, space)
            Group.groups[g].add(m)
    except:
        logger.info("No Groups Found")

    # Section: aliases

    try:
        logger.info("Defining Aliases")
        for k, v in cfg['aliases'].items():
            Alias(k, v)
    except:
        logger.info("No Aliases Found")

    # Section: system

    try:
        logger.info("Getting System Parameters")
        specs['system'] = cfg['system']
    except:
        logger.info("No System Parameters Found")
        specs['system'] = {}

    # Section: variables

    try:
        logger.info("Defining Variables")
        for k, v in cfg['variables'].items():
            Variable(k, v)
    except:
        logger.info("No Variables Found")

    # Section: functions

    try:
        logger.info("Getting Variable Functions")
        specs['functions'] = cfg['functions']
    except:
        logger.info("No Variable Functions Found")
        specs['functions'] = {}

    # Log the stock parameters

    logger.info('MARKET PARAMETERS:')
    logger.info('features        = %s', specs['features'])
    logger.info('forecast_period = %d', specs['forecast_period'])
    logger.info('fractal         = %s', specs['fractal'])
    logger.info('leaders         = %s', specs['leaders'])
    logger.info('data_history    = %d', specs['data_history'])
    logger.info('predict_history = %s', specs['predict_history'])
    logger.info('schema          = %s', specs['schema'])
    logger.info('system          = %s', specs['system'])
    logger.info('target_group    = %s', specs['target_group'])

    # Market Specifications
    return specs
Example #18
def save_model(model, tag, partition):
    r"""Save the results in the model file.

    Parameters
    ----------
    model : alphapy.Model
        The model object to save.
    tag : str
        A unique identifier for the output files, e.g., a date stamp.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    None : None

    Notes
    -----

    The following components are extracted from the model object
    and saved to disk:

    * Model predictor (via joblib/pickle)
    * Predictions
    * Probabilities (classification only)
    * Rankings
    * Submission File (optional)

    """

    logger.info('=' * 80)

    # Extract model parameters.

    directory = model.specs['directory']
    extension = model.specs['extension']
    model_type = model.specs['model_type']
    submission_file = model.specs['submission_file']
    submit_probas = model.specs['submit_probas']

    # Get date stamp to record file creation

    d = datetime.now()
    f = "%Y%m%d"
    timestamp = d.strftime(f)

    # Save the model predictor
    save_predictor(model, timestamp)

    # Save the feature map
    save_feature_map(model, timestamp)

    # Specify input and output directories

    input_dir = SSEP.join([directory, 'input'])
    output_dir = SSEP.join([directory, 'output'])

    # Save predictions
    preds, probas = save_predictions(model, tag, partition)

    # Generate submission file

    if submission_file:
        sample_spec = PSEP.join([submission_file, extension])
        sample_input = SSEP.join([input_dir, sample_spec])
        ss = pd.read_csv(sample_input)
        if submit_probas and model_type == ModelType.classification:
            ss[ss.columns[1]] = probas
        else:
            ss[ss.columns[1]] = preds
        submission_base = USEP.join(['submission', timestamp])
        submission_spec = PSEP.join([submission_base, extension])
        submission_output = SSEP.join([output_dir, submission_spec])
        logger.info("Saving Submission to %s", submission_output)
        ss.to_csv(submission_output, index=False)
Example #19
def get_model_config():
    r"""Read in the configuration file for AlphaPy.

    Parameters
    ----------
    None : None

    Returns
    -------
    specs : dict
        The parameters for controlling AlphaPy.

    Raises
    ------
    ValueError
        Unrecognized value of a ``model.yml`` field.

    """

    logger.info("Model Configuration")

    # Read the configuration file

    full_path = SSEP.join([PSEP, 'config', 'model.yml'])
    with open(full_path, 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

    # Store configuration parameters in dictionary

    specs = {}

    # Section: project [this section must be first]

    specs['directory'] = cfg['project']['directory']
    specs['extension'] = cfg['project']['file_extension']
    specs['submission_file'] = cfg['project']['submission_file']
    specs['submit_probas'] = cfg['project']['submit_probas']

    # Section: data

    specs['drop'] = cfg['data']['drop']
    specs['features'] = cfg['data']['features']
    specs['sentinel'] = cfg['data']['sentinel']
    specs['separator'] = cfg['data']['separator']
    specs['shuffle'] = cfg['data']['shuffle']
    specs['split'] = cfg['data']['split']
    specs['target'] = cfg['data']['target']
    specs['target_value'] = cfg['data']['target_value']
    # sampling
    specs['sampling'] = cfg['data']['sampling']['option']
    # determine whether or not sampling method is valid
    samplers = {x.name: x.value for x in SamplingMethod}
    sampling_method = cfg['data']['sampling']['method']
    if sampling_method in samplers:
        specs['sampling_method'] = SamplingMethod(samplers[sampling_method])
    else:
        raise ValueError("model.yml data:sampling:method %s unrecognized" %
                         sampling_method)
    # end of sampling method
    specs['sampling_ratio'] = cfg['data']['sampling']['ratio']

    # Section: features

    # clustering
    specs['clustering'] = cfg['features']['clustering']['option']
    specs['cluster_min'] = cfg['features']['clustering']['minimum']
    specs['cluster_max'] = cfg['features']['clustering']['maximum']
    specs['cluster_inc'] = cfg['features']['clustering']['increment']
    # counts
    specs['counts'] = cfg['features']['counts']['option']
    # encoding
    specs['rounding'] = cfg['features']['encoding']['rounding']
    # determine whether or not encoder is valid
    encoders = {x.name: x.value for x in Encoders}
    encoder = cfg['features']['encoding']['type']
    if encoder in encoders:
        specs['encoder'] = Encoders(encoders[encoder])
    else:
        raise ValueError("model.yml features:encoding:type %s unrecognized" %
                         encoder)
    # factors
    specs['factors'] = cfg['features']['factors']
    # interactions
    specs['interactions'] = cfg['features']['interactions']['option']
    specs['isample_pct'] = cfg['features']['interactions']['sampling_pct']
    specs['poly_degree'] = cfg['features']['interactions']['poly_degree']
    # isomap
    specs['isomap'] = cfg['features']['isomap']['option']
    specs['iso_components'] = cfg['features']['isomap']['components']
    specs['iso_neighbors'] = cfg['features']['isomap']['neighbors']
    # log transformation
    specs['logtransform'] = cfg['features']['logtransform']['option']
    # low-variance features
    specs['lv_remove'] = cfg['features']['variance']['option']
    specs['lv_threshold'] = cfg['features']['variance']['threshold']
    # NumPy
    specs['numpy'] = cfg['features']['numpy']['option']
    # pca
    specs['pca'] = cfg['features']['pca']['option']
    specs['pca_min'] = cfg['features']['pca']['minimum']
    specs['pca_max'] = cfg['features']['pca']['maximum']
    specs['pca_inc'] = cfg['features']['pca']['increment']
    specs['pca_whiten'] = cfg['features']['pca']['whiten']
    # Scaling
    specs['scaler_option'] = cfg['features']['scaling']['option']
    # determine whether or not scaling type is valid
    scaler_types = {x.name: x.value for x in Scalers}
    scaler_type = cfg['features']['scaling']['type']
    if scaler_type in scaler_types:
        specs['scaler_type'] = Scalers(scaler_types[scaler_type])
    else:
        raise ValueError("model.yml features:scaling:type %s unrecognized" %
                         scaler_type)
    # SciPy
    specs['scipy'] = cfg['features']['scipy']['option']
    # text
    specs['ngrams_max'] = cfg['features']['text']['ngrams']
    specs['vectorize'] = cfg['features']['text']['vectorize']
    # t-sne
    specs['tsne'] = cfg['features']['tsne']['option']
    specs['tsne_components'] = cfg['features']['tsne']['components']
    specs['tsne_learn_rate'] = cfg['features']['tsne']['learning_rate']
    specs['tsne_perplexity'] = cfg['features']['tsne']['perplexity']

    # Section: model

    specs['algorithms'] = cfg['model']['algorithms']
    specs['cv_folds'] = cfg['model']['cv_folds']
    # determine whether or not model type is valid
    model_types = {x.name: x.value for x in ModelType}
    model_type = cfg['model']['type']
    if model_type in model_types:
        specs['model_type'] = ModelType(model_types[model_type])
    else:
        raise ValueError("model.yml model:type %s unrecognized" % model_type)
    # end of model type
    specs['n_estimators'] = cfg['model']['estimators']
    specs['pvalue_level'] = cfg['model']['pvalue_level']
    specs['scorer'] = cfg['model']['scoring_function']
    # calibration
    specs['calibration'] = cfg['model']['calibration']['option']
    specs['cal_type'] = cfg['model']['calibration']['type']
    # feature selection
    specs['feature_selection'] = cfg['model']['feature_selection']['option']
    specs['fs_percentage'] = cfg['model']['feature_selection']['percentage']
    specs['fs_uni_grid'] = cfg['model']['feature_selection']['uni_grid']
    score_func = cfg['model']['feature_selection']['score_func']
    if score_func in feature_scorers:
        specs['fs_score_func'] = feature_scorers[score_func]
    else:
        raise ValueError(
            "model.yml model:feature_selection:score_func %s unrecognized" %
            score_func)
    # grid search
    specs['grid_search'] = cfg['model']['grid_search']['option']
    specs['gs_iters'] = cfg['model']['grid_search']['iterations']
    specs['gs_random'] = cfg['model']['grid_search']['random']
    specs['gs_sample'] = cfg['model']['grid_search']['subsample']
    specs['gs_sample_pct'] = cfg['model']['grid_search']['sampling_pct']
    # rfe
    specs['rfe'] = cfg['model']['rfe']['option']
    specs['rfe_step'] = cfg['model']['rfe']['step']

    # Section: pipeline

    specs['n_jobs'] = cfg['pipeline']['number_jobs']
    specs['seed'] = cfg['pipeline']['seed']
    specs['verbosity'] = cfg['pipeline']['verbosity']

    # Section: plots

    specs['calibration_plot'] = cfg['plots']['calibration']
    specs['confusion_matrix'] = cfg['plots']['confusion_matrix']
    specs['importances'] = cfg['plots']['importances']
    specs['learning_curve'] = cfg['plots']['learning_curve']
    specs['roc_curve'] = cfg['plots']['roc_curve']

    # Section: treatments

    try:
        specs['treatments'] = cfg['treatments']
    except:
        specs['treatments'] = None
        logger.info("No Treatments Found")

    # Section: xgboost

    specs['esr'] = cfg['xgboost']['stopping_rounds']

    # Log the configuration parameters

    logger.info('MODEL PARAMETERS:')
    logger.info('algorithms        = %s', specs['algorithms'])
    logger.info('calibration       = %r', specs['calibration'])
    logger.info('cal_type          = %s', specs['cal_type'])
    logger.info('calibration_plot  = %r', specs['calibration_plot'])
    logger.info('clustering        = %r', specs['clustering'])
    logger.info('cluster_inc       = %d', specs['cluster_inc'])
    logger.info('cluster_max       = %d', specs['cluster_max'])
    logger.info('cluster_min       = %d', specs['cluster_min'])
    logger.info('confusion_matrix  = %r', specs['confusion_matrix'])
    logger.info('counts            = %r', specs['counts'])
    logger.info('cv_folds          = %d', specs['cv_folds'])
    logger.info('directory         = %s', specs['directory'])
    logger.info('extension         = %s', specs['extension'])
    logger.info('drop              = %s', specs['drop'])
    logger.info('encoder           = %r', specs['encoder'])
    logger.info('esr               = %d', specs['esr'])
    logger.info('factors           = %s', specs['factors'])
    logger.info('features [X]      = %s', specs['features'])
    logger.info('feature_selection = %r', specs['feature_selection'])
    logger.info('fs_percentage     = %d', specs['fs_percentage'])
    logger.info('fs_score_func     = %s', specs['fs_score_func'])
    logger.info('fs_uni_grid       = %s', specs['fs_uni_grid'])
    logger.info('grid_search       = %r', specs['grid_search'])
    logger.info('gs_iters          = %d', specs['gs_iters'])
    logger.info('gs_random         = %r', specs['gs_random'])
    logger.info('gs_sample         = %r', specs['gs_sample'])
    logger.info('gs_sample_pct     = %f', specs['gs_sample_pct'])
    logger.info('importances       = %r', specs['importances'])
    logger.info('interactions      = %r', specs['interactions'])
    logger.info('isomap            = %r', specs['isomap'])
    logger.info('iso_components    = %d', specs['iso_components'])
    logger.info('iso_neighbors     = %d', specs['iso_neighbors'])
    logger.info('isample_pct       = %d', specs['isample_pct'])
    logger.info('learning_curve    = %r', specs['learning_curve'])
    logger.info('logtransform      = %r', specs['logtransform'])
    logger.info('lv_remove         = %r', specs['lv_remove'])
    logger.info('lv_threshold      = %f', specs['lv_threshold'])
    logger.info('model_type        = %r', specs['model_type'])
    logger.info('n_estimators      = %d', specs['n_estimators'])
    logger.info('n_jobs            = %d', specs['n_jobs'])
    logger.info('ngrams_max        = %d', specs['ngrams_max'])
    logger.info('numpy             = %r', specs['numpy'])
    logger.info('pca               = %r', specs['pca'])
    logger.info('pca_inc           = %d', specs['pca_inc'])
    logger.info('pca_max           = %d', specs['pca_max'])
    logger.info('pca_min           = %d', specs['pca_min'])
    logger.info('pca_whiten        = %r', specs['pca_whiten'])
    logger.info('poly_degree       = %d', specs['poly_degree'])
    logger.info('pvalue_level      = %f', specs['pvalue_level'])
    logger.info('rfe               = %r', specs['rfe'])
    logger.info('rfe_step          = %d', specs['rfe_step'])
    logger.info('roc_curve         = %r', specs['roc_curve'])
    logger.info('rounding          = %d', specs['rounding'])
    logger.info('sampling          = %r', specs['sampling'])
    logger.info('sampling_method   = %r', specs['sampling_method'])
    logger.info('sampling_ratio    = %f', specs['sampling_ratio'])
    logger.info('scaler_option     = %r', specs['scaler_option'])
    logger.info('scaler_type       = %r', specs['scaler_type'])
    logger.info('scipy             = %r', specs['scipy'])
    logger.info('scorer            = %s', specs['scorer'])
    logger.info('seed              = %d', specs['seed'])
    logger.info('sentinel          = %d', specs['sentinel'])
    logger.info('separator         = %s', specs['separator'])
    logger.info('shuffle           = %r', specs['shuffle'])
    logger.info('split             = %f', specs['split'])
    logger.info('submission_file   = %s', specs['submission_file'])
    logger.info('submit_probas     = %r', specs['submit_probas'])
    logger.info('target [y]        = %s', specs['target'])
    logger.info('target_value      = %d', specs['target_value'])
    logger.info('treatments        = %s', specs['treatments'])
    logger.info('tsne              = %r', specs['tsne'])
    logger.info('tsne_components   = %d', specs['tsne_components'])
    logger.info('tsne_learn_rate   = %f', specs['tsne_learn_rate'])
    logger.info('tsne_perplexity   = %f', specs['tsne_perplexity'])
    logger.info('vectorize         = %r', specs['vectorize'])
    logger.info('verbosity         = %d', specs['verbosity'])

    # Specifications to create the model
    return specs
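
A hypothetical fragment of the parsed model.yml, shaped to match a few of the lookups above; the keys mirror the code, the values are illustrative only.

cfg = {
    'project': {'directory': '.', 'file_extension': 'csv',
                'submission_file': '', 'submit_probas': False},
    'pipeline': {'number_jobs': -1, 'seed': 42, 'verbosity': 0},
    'plots': {'calibration': True, 'confusion_matrix': True,
              'importances': True, 'learning_curve': True, 'roc_curve': True},
}
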
Example #20
def get_market_config():
    r"""Read the configuration file for MarketFlow.

    Parameters
    ----------
    None : None

    Returns
    -------
    specs : dict
        The parameters for controlling MarketFlow.

    """

    logger.info("MarketFlow Configuration")

    # Read the configuration file

    full_path = SSEP.join([PSEP, 'config', 'market.yml'])
    with open(full_path, 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

    # Store configuration parameters in dictionary

    specs = {}

    # Section: market [this section must be first]

    specs['create_model'] = cfg['market']['create_model']
    fractal = cfg['market']['data_fractal']
    try:
        _ = pd.to_timedelta(fractal)
    except:
        logger.info("data_fractal [%s] is an invalid pandas offset", fractal)
    specs['data_fractal'] = fractal
    specs['data_history'] = cfg['market']['data_history']
    specs['forecast_period'] = cfg['market']['forecast_period']
    fractal = cfg['market']['fractal']
    try:
        _ = pd.to_timedelta(fractal)
    except:
        logger.info("fractal [%s] is an invalid pandas offset", fractal)
    specs['fractal'] = fractal
    specs['lag_period'] = cfg['market']['lag_period']
    specs['leaders'] = cfg['market']['leaders']
    specs['predict_history'] = cfg['market']['predict_history']
    specs['schema'] = cfg['market']['schema']
    specs['subschema'] = cfg['market']['subschema']
    specs['api_key_name'] = cfg['market']['api_key_name']
    specs['api_key'] = cfg['market']['api_key']
    specs['subject'] = cfg['market']['subject']
    specs['target_group'] = cfg['market']['target_group']

    # Set API Key environment variable
    if specs['api_key']:
        os.environ[specs['api_key_name']] = specs['api_key']

    # Create the subject/schema/fractal namespace

    sspecs = [specs['subject'], specs['schema'], specs['fractal']]
    space = Space(*sspecs)

    # Section: features

    try:
        logger.info("Getting Features")
        specs['features'] = cfg['features']
    except:
        logger.info("No Features Found")
        specs['features'] = {}

    # Section: groups

    try:
        logger.info("Defining Groups")
        for g, m in list(cfg['groups'].items()):
            Group(g, space)
            Group.groups[g].add(m)
    except:
        logger.info("No Groups Found")

    # Section: aliases

    try:
        logger.info("Defining Aliases")
        for k, v in list(cfg['aliases'].items()):
            Alias(k, v)
    except:
        logger.info("No Aliases Found")

    # Section: system

    try:
        logger.info("Getting System Parameters")
        specs['system'] = cfg['system']
    except:
        logger.info("No System Parameters Found")
        specs['system'] = {}

    # Section: variables

    logger.info("Defining AlphaPy Variables [phigh, plow]")

    Variable('phigh', 'probability >= 0.7')
    Variable('plow', 'probability <= 0.3')

    try:
        logger.info("Defining User Variables")
        for k, v in list(cfg['variables'].items()):
            Variable(k, v)
    except:
        logger.info("No Variables Found")

    # Section: functions

    try:
        logger.info("Getting Variable Functions")
        specs['functions'] = cfg['functions']
    except:
        logger.info("No Variable Functions Found")
        specs['functions'] = {}

    # Log the stock parameters

    logger.info('MARKET PARAMETERS:')
    logger.info('api_key         = %s', specs['api_key'])
    logger.info('api_key_name    = %s', specs['api_key_name'])
    logger.info('create_model    = %r', specs['create_model'])
    logger.info('data_fractal    = %s', specs['data_fractal'])
    logger.info('data_history    = %d', specs['data_history'])
    logger.info('features        = %s', specs['features'])
    logger.info('forecast_period = %d', specs['forecast_period'])
    logger.info('fractal         = %s', specs['fractal'])
    logger.info('lag_period      = %d', specs['lag_period'])
    logger.info('leaders         = %s', specs['leaders'])
    logger.info('predict_history = %s', specs['predict_history'])
    logger.info('schema          = %s', specs['schema'])
    logger.info('subject         = %s', specs['subject'])
    logger.info('subschema       = %s', specs['subschema'])
    logger.info('system          = %s', specs['system'])
    logger.info('target_group    = %s', specs['target_group'])

    # Market Specifications
    return specs
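
A small sketch of the offset validation used above: a valid pandas offset string converts cleanly with pd.to_timedelta, while an unrecognized string raises ValueError.

import pandas as pd

def is_valid_fractal(fractal):
    # return True when the string parses as a pandas timedelta/offset
    try:
        pd.to_timedelta(fractal)
        return True
    except ValueError:
        return False

print(is_valid_fractal('1D'), is_valid_fractal('not-an-offset'))
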
Example #21
def save_predictions(model, tag, partition):
    r"""Save the predictions to disk.

    Parameters
    ----------
    model : alphapy.Model
        The model object to save.
    tag : str
        A unique identifier for the output files, e.g., a date stamp.
    partition : alphapy.Partition
        Reference to the dataset.

    Returns
    -------
    preds : numpy array
        The prediction vector.
    probas : numpy array
        The probability vector.

    """

    # Extract model parameters.

    directory = model.specs['directory']
    extension = model.specs['extension']
    model_type = model.specs['model_type']
    separator = model.specs['separator']

    # Get date stamp to record file creation
    timestamp = get_datestamp()

    # Specify input and output directories

    input_dir = SSEP.join([directory, 'input'])
    output_dir = SSEP.join([directory, 'output'])

    # Read the prediction frame
    file_spec = ''.join([datasets[partition], '*'])
    file_name = most_recent_file(input_dir, file_spec)
    file_name = file_name.split(SSEP)[-1].split(PSEP)[0]
    pf = read_frame(input_dir, file_name, extension, separator)

    # Cull records before the prediction date

    try:
        predict_date = model.specs['predict_date']
        found_pdate = True
    except:
        found_pdate = False

    if found_pdate:
        pd_indices = pf[pf.date >= predict_date].index.tolist()
        pf = pf.iloc[pd_indices]
    else:
        pd_indices = pf.index.tolist()

    # Save predictions for all projects

    logger.info("Saving Predictions")
    output_file = USEP.join(['predictions', timestamp])
    preds = model.preds[(tag, partition)].squeeze()
    if found_pdate:
        preds = np.take(preds, pd_indices)
    pred_series = pd.Series(preds, index=pd_indices)
    df_pred = pd.DataFrame(pred_series, columns=['prediction'])
    write_frame(df_pred, output_dir, output_file, extension, separator)

    # Save probabilities for classification projects

    probas = None
    if model_type == ModelType.classification:
        logger.info("Saving Probabilities")
        output_file = USEP.join(['probabilities', timestamp])
        probas = model.probas[(tag, partition)].squeeze()
        if found_pdate:
            probas = np.take(probas, pd_indices)
        prob_series = pd.Series(probas, index=pd_indices)
        df_prob = pd.DataFrame(prob_series, columns=['probability'])
        write_frame(df_prob, output_dir, output_file, extension, separator)

    # Save ranked predictions

    logger.info("Saving Ranked Predictions")
    pf['prediction'] = pred_series
    if model_type == ModelType.classification:
        pf['probability'] = prob_series
        pf.sort_values('probability', ascending=False, inplace=True)
    else:
        pf.sort_values('prediction', ascending=False, inplace=True)
    output_file = USEP.join(['rankings', timestamp])
    write_frame(pf, output_dir, output_file, extension, separator)

    # Return predictions and any probabilities
    return preds, probas
Example #22
def run_analysis(analysis,
                 lag_period,
                 forecast_period,
                 leaders,
                 predict_history,
                 splits=True):
    r"""Run an analysis for a given model and group.

    First, the data are loaded for each member of the analysis group.
    Then, the target value is lagged for the ``forecast_period``, and
    any ``leaders`` are lagged as well. Each frame is split along
    the ``predict_date`` from the ``analysis``, and finally the
    train and test files are generated.

    Parameters
    ----------
    analysis : alphapy.Analysis
        The analysis to run.
    lag_period : int
        The number of lagged features for the analysis.
    forecast_period : int
        The period for forecasting the target of the analysis.
    leaders : list
        The features that are contemporaneous with the target.
    predict_history : int
        The number of periods required for lookback calculations.
    splits : bool, optional
        If ``True``, then the data for each member of the analysis
        group are in separate files.

    Returns
    -------
    analysis : alphapy.Analysis
        The completed analysis.

    """

    # Unpack analysis

    name = analysis.name
    model = analysis.model
    group = analysis.group

    # Unpack model data

    predict_file = model.predict_file
    test_file = model.test_file
    train_file = model.train_file

    # Unpack model specifications

    directory = model.specs['directory']
    extension = model.specs['extension']
    predict_date = model.specs['predict_date']
    predict_mode = model.specs['predict_mode']
    separator = model.specs['separator']
    target = model.specs['target']
    train_date = model.specs['train_date']

    # Calculate split date
    logger.info("Analysis Dates")
    split_date = subtract_days(predict_date, predict_history)
    logger.info("Train Date: %s", train_date)
    logger.info("Split Date: %s", split_date)
    logger.info("Test  Date: %s", predict_date)

    # Load the data frames
    data_frames = load_frames(group, directory, extension, separator, splits)

    # Create dataframes

    if predict_mode:
        # create predict frame
        predict_frame = pd.DataFrame()
    else:
        # create train and test frames
        train_frame = pd.DataFrame()
        test_frame = pd.DataFrame()

    # Subset each individual frame and add to the master frame

    leaders.extend([TAG_ID])
    for df in data_frames:
        try:
            tag = df[TAG_ID].unique()[0]
        except:
            tag = 'Unknown'
        first_date = df.index[0]
        last_date = df.index[-1]
        logger.info("Analyzing %s from %s to %s", tag, first_date, last_date)
        # sequence leaders, laggards, and target(s)
        df = sequence_frame(df, target, forecast_period, leaders, lag_period)
        # get frame subsets
        if predict_mode:
            new_predict = df.loc[(df.index >= split_date)
                                 & (df.index <= last_date)]
            if len(new_predict) > 0:
                predict_frame = predict_frame.append(new_predict)
            else:
                logger.info(
                    "Prediction frame %s has zero rows. Check prediction date.",
                    tag)
        else:
            # split data into train and test
            new_train = df.loc[(df.index >= train_date)
                               & (df.index < split_date)]
            if len(new_train) > 0:
                new_train = new_train.dropna()
                train_frame = train_frame.append(new_train)
                new_test = df.loc[(df.index >= split_date)
                                  & (df.index <= last_date)]
                if len(new_test) > 0:
                    # check if target column has NaN values
                    nan_count = df[target].isnull().sum()
                    forecast_check = forecast_period - 1
                    if nan_count != forecast_check:
                        logger.info("%s has %d records with NaN targets", tag,
                                    nan_count)
                    # drop records with NaN values in target column
                    new_test = new_test.dropna(subset=[target])
                    # append selected records to the test frame
                    test_frame = test_frame.append(new_test)
                else:
                    logger.info(
                        "Testing frame %s has zero rows. Check prediction date.",
                        tag)
            else:
                logger.info(
                    "Training frame %s has zero rows. Check data source.", tag)

    # Write out the frames for input into the AlphaPy pipeline

    directory = SSEP.join([directory, 'input'])
    if predict_mode:
        # write out the predict frame
        write_frame(predict_frame,
                    directory,
                    predict_file,
                    extension,
                    separator,
                    index=True,
                    index_label='date')
    else:
        # write out the train and test frames
        write_frame(train_frame,
                    directory,
                    train_file,
                    extension,
                    separator,
                    index=True,
                    index_label='date')
        write_frame(test_frame,
                    directory,
                    test_file,
                    extension,
                    separator,
                    index=True,
                    index_label='date')

    # Run the AlphaPy pipeline
    analysis.model = main_pipeline(model)

    # Return the analysis
    return analysis
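
A minimal sketch of the date-based split performed above, using an illustrative date-indexed frame: rows before the split date go to training, rows from the split date onward go to testing.

import pandas as pd

idx = pd.date_range('2020-01-01', periods=10, freq='D')
df = pd.DataFrame({'target': range(10)}, index=idx)
train_date, split_date = '2020-01-01', '2020-01-08'
last_date = df.index[-1]
# train on [train_date, split_date), test on [split_date, last_date]
train_frame = df.loc[(df.index >= train_date) & (df.index < split_date)]
test_frame = df.loc[(df.index >= split_date) & (df.index <= last_date)]
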
Example #23
def main(args=None):
    r"""MarketFlow Main Program

    Notes
    -----
    (1) Initialize logging.
    (2) Parse the command line arguments.
    (3) Get the market configuration.
    (4) Get the model configuration.
    (5) Create the model object.
    (6) Call the main MarketFlow pipeline.

    Raises
    ------
    ValueError
        Training date must be before prediction date.

    """

    # Suppress Warnings

    warnings.simplefilter(action='ignore', category=DeprecationWarning)
    warnings.simplefilter(action='ignore', category=FutureWarning)

    # Logging

    logging.basicConfig(format="[%(asctime)s] %(levelname)s\t%(message)s",
                        filename="market_flow.log",
                        filemode='a',
                        level=logging.DEBUG,
                        datefmt='%m/%d/%y %H:%M:%S')
    formatter = logging.Formatter("[%(asctime)s] %(levelname)s\t%(message)s",
                                  datefmt='%m/%d/%y %H:%M:%S')
    console = logging.StreamHandler()
    console.setFormatter(formatter)
    console.setLevel(logging.INFO)
    logging.getLogger().addHandler(console)

    # Start the pipeline

    logger.info('*' * 80)
    logger.info("MarketFlow Start")
    logger.info('*' * 80)

    # Argument Parsing

    parser = argparse.ArgumentParser(description="MarketFlow Parser")
    parser.add_argument('--pdate',
                        dest='predict_date',
                        help="prediction date is in the format: YYYY-MM-DD",
                        required=False,
                        type=valid_date)
    parser.add_argument('--tdate',
                        dest='train_date',
                        help="training date is in the format: YYYY-MM-DD",
                        required=False,
                        type=valid_date)
    mode_group = parser.add_mutually_exclusive_group(required=False)
    mode_group.add_argument('--predict', dest='predict_mode', action='store_true')
    mode_group.add_argument('--train', dest='predict_mode', action='store_false')
    parser.set_defaults(predict_mode=False)
    args = parser.parse_args()

    # Set train and predict dates

    if args.train_date:
        train_date = args.train_date
    else:
        train_date = datetime.date(1900, 1, 1).strftime("%Y-%m-%d")

    if args.predict_date:
        predict_date = args.predict_date
    else:
        predict_date = datetime.date.today().strftime("%Y-%m-%d")

    # Verify that the dates are in sequence.

    if train_date >= predict_date:
        raise ValueError("Training date must be before prediction date")
    else:
        logger.info("Training Date: %s", train_date)
        logger.info("Prediction Date: %s", predict_date)

    # Read stock configuration file
    market_specs = get_market_config()

    # Read model configuration file

    model_specs = get_model_config()
    model_specs['predict_mode'] = args.predict_mode
    model_specs['predict_date'] = predict_date
    model_specs['train_date'] = train_date

    # Create directories if necessary

    output_dirs = [
        'config', 'data', 'input', 'model', 'output', 'plots', 'systems'
    ]
    for od in output_dirs:
        output_dir = SSEP.join([model_specs['directory'], od])
        if not os.path.exists(output_dir):
            logger.info("Creating directory %s", output_dir)
            os.makedirs(output_dir)

    # Create a model object from the specifications
    model = Model(model_specs)

    # Start the pipeline
    model = market_pipeline(model, market_specs)

    # Complete the pipeline

    logger.info('*' * 80)
    logger.info("MarketFlow End")
    logger.info('*' * 80)
Example #24
def training_pipeline(model):
    r"""AlphaPy Training Pipeline

    Parameters
    ----------
    model : alphapy.Model
        The model object for controlling the pipeline.

    Returns
    -------
    model : alphapy.Model
        The final results are stored in the model object.

    Raises
    ------
    IndexError
        If the number of columns of the train and test data do not match,
        then this exception is raised.
    KeyError
        If the specified scoring function is not found.

    """

    logger.info("Training Pipeline")

    # Unpack the model specifications

    calibration = model.specs['calibration']
    directory = model.specs['directory']
    drop = model.specs['drop']
    extension = model.specs['extension']
    feature_selection = model.specs['feature_selection']
    grid_search = model.specs['grid_search']
    model_type = model.specs['model_type']
    predict_mode = model.specs['predict_mode']
    rfe = model.specs['rfe']
    sampling = model.specs['sampling']
    scorer = model.specs['scorer']
    separator = model.specs['separator']
    target = model.specs['target']

    # Get train and test data

    X_train, y_train = get_data(model, Partition.train)
    X_test, y_test = get_data(model, Partition.test)

    # Determine if there are any test labels

    if y_test.any():
        logger.info("Test Labels Found")
        model.test_labels = True
    model = save_features(model, X_train, X_test, y_train, y_test)

    # Log feature statistics

    logger.info("Original Feature Statistics")
    logger.info("Number of Training Rows    : %d", X_train.shape[0])
    logger.info("Number of Training Columns : %d", X_train.shape[1])
    if model_type == ModelType.classification:
        uv, uc = np.unique(y_train, return_counts=True)
        logger.info("Unique Training Values for %s : %s", target, uv)
        logger.info("Unique Training Counts for %s : %s", target, uc)
    logger.info("Number of Testing Rows     : %d", X_test.shape[0])
    logger.info("Number of Testing Columns  : %d", X_test.shape[1])
    if model_type == ModelType.classification and model.test_labels:
        uv, uc = np.unique(y_test, return_counts=True)
        logger.info("Unique Testing Values for %s : %s", target, uv)
        logger.info("Unique Testing Counts for %s : %s", target, uc)

    # Merge training and test data

    if X_train.shape[1] == X_test.shape[1]:
        split_point = X_train.shape[0]
        X = pd.concat([X_train, X_test])
    else:
        raise IndexError(
            "The number of training and test columns [%d, %d] must match." %
            (X_train.shape[1], X_test.shape[1]))

    # Apply treatments to the feature matrix
    all_features = apply_treatments(model, X)

    # Drop features
    all_features = drop_features(all_features, drop)

    # Save the train and test files with extracted and dropped features

    datestamp = get_datestamp()
    data_dir = SSEP.join([directory, 'input'])
    df_train = all_features.iloc[:split_point, :]
    df_train = pd.concat(
        [df_train, pd.DataFrame(y_train, columns=[target])], axis=1)
    output_file = USEP.join([model.train_file, datestamp])
    write_frame(df_train, data_dir, output_file, extension, separator)
    df_test = all_features.iloc[split_point:, :]
    if y_test.any():
        df_test = pd.concat(
            [df_test, pd.DataFrame(y_test, columns=[target])], axis=1)
    output_file = USEP.join([model.test_file, datestamp])
    write_frame(df_test, data_dir, output_file, extension, separator)

    # Create crosstabs for any categorical features

    if model_type == ModelType.classification:
        create_crosstabs(model)

    # Create initial features

    all_features = create_features(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Generate interactions

    all_features = create_interactions(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Remove low-variance features

    all_features = remove_lv_features(model, all_features)
    X_train, X_test = np.array_split(all_features, [split_point])
    model = save_features(model, X_train, X_test)

    # Shuffle the data [if specified]
    model = shuffle_data(model)

    # Oversampling or Undersampling [if specified]

    if model_type == ModelType.classification:
        if sampling:
            model = sample_data(model)
        else:
            logger.info("Skipping Sampling")
        # Get sample weights (classification only)
        model = get_class_weights(model)

    # Perform feature selection, independent of algorithm

    if feature_selection:
        model = select_features(model)

    # Get the available classifiers and regressors

    logger.info("Getting All Estimators")
    estimators = get_estimators(model)

    # Get the available scorers

    if scorer not in scorers:
        raise KeyError("Scorer function %s not found" % scorer)

    # Model Selection

    logger.info("Selecting Models")

    for algo in model.algolist:
        logger.info("Algorithm: %s", algo)
        # select estimator
        try:
            estimator = estimators[algo]
            scoring = estimator.scoring
            est = estimator.estimator
        except KeyError:
            logger.info("Algorithm %s not found", algo)
            continue
        # initial fit
        model = first_fit(model, algo, est)
        # recursive feature elimination
        if rfe:
            if scoring:
                model = rfecv_search(model, algo)
            elif hasattr(est, "coef_"):
                model = rfe_search(model, algo)
            else:
                logger.info("No RFE Available for %s", algo)
        # grid search
        if grid_search:
            model = hyper_grid_search(model, estimator)
        # predictions
        model = make_predictions(model, algo, calibration)

    # Create a blended estimator

    if len(model.algolist) > 1:
        model = predict_blend(model)

    # Generate metrics

    model = generate_metrics(model, Partition.train)
    model = generate_metrics(model, Partition.test)

    # Store the best estimator
    model = predict_best(model)

    # Generate plots

    generate_plots(model, Partition.train)
    if model.test_labels:
        generate_plots(model, Partition.test)

    # Save best features and predictions
    save_model(model, 'BEST', Partition.test)

    # Return the model
    return model
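
Throughout the pipeline above, the combined feature frame is repeatedly split back into its train and test partitions with np.array_split at split_point. A minimal sketch with hypothetical data, not the pipeline's actual frames:

import numpy as np
import pandas as pd

# hypothetical combined feature frame; the first 7 rows came from the training set
all_features = pd.DataFrame({'f1': range(10), 'f2': range(10, 20)})
split_point = 7

# array_split at [split_point] returns the two partitions in order
X_train, X_test = np.array_split(all_features, [split_point])
assert len(X_train) == 7 and len(X_test) == 3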
Example #25
def get_estimators(model):
    r"""Define all the AlphaPy estimators based on the contents
    of the ``algos.yml`` file.

    Parameters
    ----------
    model : alphapy.Model
        The model object containing global AlphaPy parameters.

    Returns
    -------
    estimators : dict
        All of the estimators required for running the pipeline.

    """

    # Extract model data

    directory = model.specs['directory']
    n_estimators = model.specs['n_estimators']
    n_jobs = model.specs['n_jobs']
    seed = model.specs['seed']
    verbosity = model.specs['verbosity']

    # Reference training data for Keras input_dim
    X_train = model.X_train

    # Initialize estimator dictionary
    estimators = {}

    # Global parameter substitution fields

    ps_fields = {'n_estimators' : 'n_estimators',
                 'iterations'   : 'n_estimators',
                 'n_jobs'       : 'n_jobs',
                 'nthread'      : 'n_jobs',
                 'thread_count' : 'n_jobs',
                 'seed'         : 'seed',
                 'random_state' : 'seed',
                 'random_seed'  : 'seed',
                 'verbosity'    : 'verbosity',
                 'verbose'      : 'verbosity'}

    # Get algorithm specifications

    config_dir = SSEP.join([directory, 'config'])
    algo_specs = get_algos_config(config_dir)

    # Create estimators for all of the algorithms

    for algo in algo_specs:
        model_type = algo_specs[algo]['model_type']
        params = algo_specs[algo]['params']
        for param in params:
            if param in ps_fields and isinstance(param, str):
                algo_specs[algo]['params'][param] = eval(ps_fields[param])
        try:
            algo_found = True
            func = estimator_map[algo]
        except KeyError:
            algo_found = False
            logger.info("Algorithm %s not found (check package installation)", algo)
        if algo_found:
            if 'KERAS' in algo:
                params['build_fn'] = create_keras_model
                layers = algo_specs[algo]['layers']
                params['nlayers'] = len(layers)
                input_dim_string = ', input_dim={})'.format(X_train.shape[1])
                layers[0] = layers[0].replace(')', input_dim_string)
                for i, layer in enumerate(layers):
                    params['layer'+str(i+1)] = layer
                compiler = algo_specs[algo]['compiler']
                params['optimizer'] = compiler['optimizer']
                params['loss'] = compiler['loss']
                try:
                    params['metrics'] = compiler['metrics']
                except KeyError:
                    pass
            est = func(**params)
            grid = algo_specs[algo]['grid']
            estimators[algo] = Estimator(algo, model_type, est, grid)

    # Return the estimator dictionary
    return estimators
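
A minimal sketch of the global parameter substitution in the loop above, using a hypothetical params dict in place of one read from algos.yml:

# global settings extracted from model.specs (hypothetical values)
n_estimators, n_jobs, seed, verbosity = 500, -1, 42, 0

# alias fields map estimator-specific parameter names to the global settings
ps_fields = {'n_estimators': 'n_estimators', 'nthread': 'n_jobs',
             'random_state': 'seed', 'verbose': 'verbosity'}

params = {'nthread': None, 'random_state': None, 'max_depth': 6}
for param in params:
    if param in ps_fields:
        # eval resolves the alias to the matching local variable's value
        params[param] = eval(ps_fields[param])

# params is now {'nthread': -1, 'random_state': 42, 'max_depth': 6}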
Example #26
def main(args=None):
    r"""AlphaPy Main Program

    Notes
    -----
    (1) Initialize logging.
    (2) Parse the command line arguments.
    (3) Get the model configuration.
    (4) Create the model object.
    (5) Call the main AlphaPy pipeline.

    """

    # Logging

    logging.basicConfig(format="[%(asctime)s] %(levelname)s\t%(message)s",
                        filename="alphapy.log",
                        filemode='a',
                        level=logging.DEBUG,
                        datefmt='%m/%d/%y %H:%M:%S')
    formatter = logging.Formatter("[%(asctime)s] %(levelname)s\t%(message)s",
                                  datefmt='%m/%d/%y %H:%M:%S')
    console = logging.StreamHandler()
    console.setFormatter(formatter)
    console.setLevel(logging.INFO)
    logging.getLogger().addHandler(console)

    # Start the pipeline

    logger.info('*' * 80)
    logger.info("AlphaPy Start")
    logger.info('*' * 80)

    # Argument Parsing

    parser = argparse.ArgumentParser(description="AlphaPy Parser")
    mode_group = parser.add_mutually_exclusive_group(required=False)
    mode_group.add_argument('--predict', dest='predict_mode', action='store_true')
    mode_group.add_argument('--train', dest='predict_mode', action='store_false')
    parser.set_defaults(predict_mode=False)
    args = parser.parse_args(args)

    # Read configuration file

    specs = get_model_config()
    specs['predict_mode'] = args.predict_mode

    # Create directories if necessary

    output_dirs = ['config', 'data', 'input', 'model', 'output', 'plots']
    for od in output_dirs:
        output_dir = SSEP.join([specs['directory'], od])
        if not os.path.exists(output_dir):
            logger.info("Creating directory %s", output_dir)
            os.makedirs(output_dir)

    # Create a model from the arguments

    logger.info("Creating Model")
    model = Model(specs)

    # Start the pipeline

    logger.info("Calling Pipeline")
    model = main_pipeline(model)

    # Complete the pipeline

    logger.info('*' * 80)
    logger.info("AlphaPy End")
    logger.info('*' * 80)
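
A hedged usage sketch: the import path below assumes main lives in AlphaPy's __main__ module, and the installed alphapy console script normally calls it for you.

from alphapy.__main__ import main   # assumed module path

main()                  # training mode (the default)
main(['--predict'])     # scoring mode, once a model has been trained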
Example #27
def write_plot(vizlib, plot, plot_type, tag, directory=None):
    r"""Save the plot to a file, or display it interactively.

    Parameters
    ----------
    vizlib : str
        The visualization library: ``'matplotlib'``, ``'seaborn'``,
        or ``'bokeh'``.
    plot : module
        Plotting context, e.g., ``plt``.
    plot_type : str
        Type of plot to generate.
    tag : str
        Unique identifier for the plot.
    directory : str, optional
        The full specification for the directory location. If
        ``directory`` is *None*, then the plot is displayed
        interactively.

    Returns
    -------
    None : None.

    Raises
    ------
    ValueError
        Unrecognized data visualization library.

    References
    ----------

    Visualization Libraries:

    * Matplotlib : http://matplotlib.org/
    * Seaborn : https://seaborn.pydata.org/
    * Bokeh : http://bokeh.pydata.org/en/latest/

    """

    # Validate visualization library

    if vizlib in ('matplotlib', 'seaborn', 'bokeh'):
        # supported library
        pass
    elif vizlib == 'plotly':
        raise ValueError("Unsupported data visualization library: %s" % vizlib)
    else:
        raise ValueError("Unrecognized data visualization library: %s" % vizlib)

    # Save or display the plot

    if directory:
        if vizlib == 'bokeh':
            file_only = ''.join([plot_type, USEP, tag, '.html'])
        else:
            file_only = ''.join([plot_type, USEP, tag, '.png'])
        file_all = SSEP.join([directory, file_only])
        logger.info("Writing plot to %s", file_all)
        if vizlib == 'matplotlib':
            plot.savefig(file_all)
        elif vizlib == 'seaborn':
            plot.savefig(file_all)
        else:
            output_file(file_all, title=tag)
            show(plot)
    else:
        if vizlib == 'bokeh':
            show(plot)
        else:
            plot.plot()
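
A minimal usage sketch with matplotlib; the project path is hypothetical, and SSEP and USEP are assumed to be the '/' and '_' separators used throughout these examples.

import matplotlib.pyplot as plt

plt.figure()
plt.plot([1, 2, 3], [1, 4, 9])
# writes ./project/plots/learning_curve_XGB.png
write_plot('matplotlib', plt, 'learning_curve', 'XGB', directory='./project/plots')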
Example #28
def run_system(model, system, group, quantity=1):
    r"""Run a system for a given group, creating a trades frame.

    Parameters
    ----------
    model : alphapy.Model
        The model object with specifications.
    system : alphapy.System or str
        The system to run, either a long/short system or a local one
        identified by function name, e.g., 'open_range_breakout'.
    group : alphapy.Group
        The group of symbols to test.
    quantity : float
        The amount to trade for each symbol, e.g., number of shares

    Returns
    -------
    tf : pandas.DataFrame
        All of the trades for this ``group``.

    """

    if isinstance(system, str):
        system_name = system
    else:
        system_name = system.name

    logger.info("Generating Trades for System %s", system_name)

    # Unpack the model data.

    directory = model.specs['directory']
    extension = model.specs['extension']
    separator = model.specs['separator']

    # Extract the group information.

    gname = group.name
    gmembers = group.members
    gspace = group.space

    # Run the system for each member of the group

    gtlist = []
    for symbol in gmembers:
        # generate the trades for this member
        if isinstance(system, str):
            try:
                tlist = globals()[system_name](symbol, gspace, quantity)
            except Exception:
                tlist = None
                logger.info("Could not execute system for %s", symbol)
        else:
            # call default long/short system
            tlist = long_short(system, symbol, gspace, quantity)
        if tlist:
            # create the local trades frame
            df = DataFrame.from_items(tlist,
                                      orient='index',
                                      columns=Trade.states)
            # add trades to global trade list
            for item in tlist:
                gtlist.append(item)
        else:
            logger.info("No trades for symbol %s", symbol)

    # Create group trades frame

    tf = None
    if gtlist:
        tspace = Space(system_name, "trades", group.space.fractal)
        gtlist = sorted(gtlist, key=lambda x: x[0])
        tf = DataFrame.from_items(gtlist, orient='index', columns=Trade.states)
        tfname = frame_name(gname, tspace)
        system_dir = SSEP.join([directory, 'systems'])
        write_frame(tf, system_dir, tfname, extension, separator, index=True)
        del tspace
    else:
        logger.info("No trades were found")

    # Return trades frame
    return tf
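
Note that DataFrame.from_items was deprecated in pandas 0.23 and removed in 1.0. A minimal sketch of an equivalent construction from the (timestamp, trade) tuples, assuming unique timestamps and using stand-in column names for Trade.states:

import pandas as pd

gtlist = [('2020-01-02', ['SPY', 'le', 100, 321.5]),      # hypothetical trades
          ('2020-01-10', ['SPY', 'lx', -100, 327.3])]
states = ['name', 'order', 'quantity', 'price']            # stand-in for Trade.states
tf = pd.DataFrame.from_dict(dict(gtlist), orient='index', columns=states)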
Example #29
def trade_system(model, system, space, intraday, name, quantity):
    r"""Trade the given system.

    Parameters
    ----------
    model : alphapy.Model
        The model object with specifications.
    system : alphapy.System
        The long/short system to run.
    space : alphapy.Space
        Namespace of instrument prices.
    intraday : bool
        If True, then run an intraday system.
    name : str
        The symbol to trade.
    quantity : float
        The amount of the ``name`` to trade, e.g., number of shares

    Returns
    -------
    tradelist : list
        List of trade entries and exits.

    Other Parameters
    ----------------
    Frame.frames : dict
        All of the data frames containing price data.

    """

    # Unpack the model data.

    directory = model.specs['directory']
    extension = model.specs['extension']
    separator = model.specs['separator']

    # Unpack the system parameters.

    longentry = system.longentry
    shortentry = system.shortentry
    longexit = system.longexit
    shortexit = system.shortexit
    holdperiod = system.holdperiod
    scale = system.scale

    # Determine whether or not this is a model-driven system.

    entries_and_exits = [longentry, shortentry, longexit, shortexit]
    active_signals = [x for x in entries_and_exits if x is not None]
    use_model = False
    for signal in active_signals:
        if any(x in signal for x in ['phigh', 'plow']):
            use_model = True

    # Read in the price frame
    pf = Frame.frames[frame_name(name, space)].df

    # Use model output probabilities as input to the system

    if use_model:
        # get latest probabilities file
        probs_dir = SSEP.join([directory, 'output'])
        file_path = most_recent_file(probs_dir, 'probabilities*')
        file_name = file_path.split(SSEP)[-1].split('.')[0]
        # read the probabilities frame and trim the price frame
        probs_frame = read_frame(probs_dir, file_name, extension, separator)
        pf = pf[-probs_frame.shape[0]:]
        probs_frame.index = pf.index
        probs_frame.columns = ['probability']
        # add probability column to price frame
        pf = pd.concat([pf, probs_frame], axis=1)

    # Evaluate the long and short events in the price frame

    for signal in active_signals:
        vexec(pf, signal)

    # Initialize trading state variables

    inlong = False
    inshort = False
    h = 0
    p = 0
    q = quantity
    tradelist = []

    # Loop through prices and generate trades

    for dt, row in pf.iterrows():
        # get closing price
        c = row['close']
        if intraday:
            bar_number = row['bar_number']
            end_of_day = row['end_of_day']
        # evaluate entry and exit conditions
        lerow = row[longentry] if longentry else None
        serow = row[shortentry] if shortentry else None
        lxrow = row[longexit] if longexit else None
        sxrow = row[shortexit] if shortexit else None
        # process the long and short events
        if lerow:
            if p < 0:
                # short active, so exit short
                tradelist.append((dt, [name, Orders.sx, -p, c]))
                inshort = False
                h = 0
                p = 0
            if p == 0 or scale:
                # go long (again)
                tradelist.append((dt, [name, Orders.le, q, c]))
                inlong = True
                p = p + q
        elif serow:
            if p > 0:
                # long active, so exit long
                tradelist.append((dt, [name, Orders.lx, -p, c]))
                inlong = False
                h = 0
                p = 0
            if p == 0 or scale:
                # go short (again)
                tradelist.append((dt, [name, Orders.se, -q, c]))
                inshort = True
                p = p - q
        # check exit conditions
        if inlong and h > 0 and lxrow:
            # long active, so exit long
            tradelist.append((dt, [name, Orders.lx, -p, c]))
            inlong = False
            h = 0
            p = 0
        if inshort and h > 0 and sxrow:
            # short active, so exit short
            tradelist.append((dt, [name, Orders.sx, -p, c]))
            inshort = False
            h = 0
            p = 0
        # if a holding period was given, then check for exit
        if holdperiod and h >= holdperiod:
            if inlong:
                tradelist.append((dt, [name, Orders.lh, -p, c]))
                inlong = False
            if inshort:
                tradelist.append((dt, [name, Orders.sh, -p, c]))
                inshort = False
            h = 0
            p = 0
        # increment the hold counter
        if inlong or inshort:
            h += 1
            if intraday and end_of_day:
                if inlong:
                    # long active, so exit long
                    tradelist.append((dt, [name, Orders.lx, -p, c]))
                    inlong = False
                if inshort:
                    # short active, so exit short
                    tradelist.append((dt, [name, Orders.sx, -p, c]))
                    inshort = False
                h = 0
                p = 0
    return tradelist
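
A minimal sketch of the probability alignment step above, with hypothetical frames: the price frame is trimmed to the rows that were scored, and the model probabilities become an extra column.

import pandas as pd

pf = pd.DataFrame({'close': [10.0, 11.0, 12.0, 13.0, 14.0]},
                  index=pd.date_range('2020-01-01', periods=5))
probs_frame = pd.DataFrame({0: [0.4, 0.7, 0.6]})     # only the last 3 bars were scored

pf = pf[-probs_frame.shape[0]:]                       # trim prices to match
probs_frame.index = pf.index
probs_frame.columns = ['probability']
pf = pd.concat([pf, probs_frame], axis=1)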
Example #30
def run_system(model, system, group, intraday=False, quantity=1):
    r"""Run a system for a given group, creating a trades frame.

    Parameters
    ----------
    model : alphapy.Model
        The model object with specifications.
    system : alphapy.System
        The system to run.
    group : alphapy.Group
        The group of symbols to trade.
    intraday : bool, optional
        If True, then this is an intraday system.
    quantity : float, optional
        The amount to trade for each symbol, e.g., number of shares

    Returns
    -------
    tf : pandas.DataFrame
        All of the trades for this ``group``.

    """

    system_name = system.name
    logger.info("Generating Trades for System %s", system_name)

    # Unpack the model data.

    directory = model.specs['directory']
    extension = model.specs['extension']
    separator = model.specs['separator']

    # Extract the group information.

    gname = group.name
    gmembers = group.members
    gspace = group.space

    # Run the system for each member of the group

    gtlist = []
    for symbol in gmembers:
        # generate the trades for this member
        tlist = trade_system(model, system, gspace, intraday, symbol, quantity)
        if tlist:
            # add trades to global trade list
            for item in tlist:
                gtlist.append(item)
        else:
            logger.info("No trades for symbol %s", symbol)

    # Create group trades frame

    tf = None
    if gtlist:
        tspace = Space(system_name, "trades", group.space.fractal)
        gtlist = sorted(gtlist, key=lambda x: x[0])
        tf = DataFrame.from_items(gtlist, orient='index', columns=Trade.states)
        tfname = frame_name(gname, tspace)
        system_dir = SSEP.join([directory, 'systems'])
        labels = ['date']
        if intraday:
            labels.append('time')
        write_frame(tf,
                    system_dir,
                    tfname,
                    extension,
                    separator,
                    index=True,
                    index_label=labels)
        del tspace
    else:
        logger.info("No trades were found")

    # Return trades frame
    return tf
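
A hedged usage sketch; model, system, and group are placeholders whose construction (Model, System, and Group objects) is covered by the other examples and the AlphaPy documentation.

trades = run_system(model, system, group, intraday=False, quantity=100)
if trades is not None:
    print(trades.head())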