def get_algos_config(cfg_dir):
    r"""Read the algorithms configuration file.

    Parameters
    ----------
    cfg_dir : str
        The directory where the configuration file ``algos.yml``
        is stored.

    Returns
    -------
    specs : dict
        The specifications for determining which algorithms to run.

    Raises
    ------
    ValueError
        If an algorithm's ``model_type`` is not a member of ``ModelType``.
    """

    logger.info("Algorithm Configuration")

    # Read the configuration file
    full_path = SSEP.join([cfg_dir, 'algos.yml'])
    with open(full_path, 'r') as ymlfile:
        specs = yaml.load(ymlfile, Loader=yaml.FullLoader)

    # Find optional packages
    find_optional_packages()

    # Ensure each algorithm has required keys
    minimum_keys = ['model_type', 'params', 'grid']
    required_keys_keras = minimum_keys + ['layers', 'compiler']
    # model_types is loop-invariant, so build the lookup table once
    model_types = {x.name: x.value for x in ModelType}
    for algo in specs:
        if 'KERAS' in algo:
            required_keys = required_keys_keras
        else:
            required_keys = minimum_keys
        algo_keys = list(specs[algo].keys())
        if set(algo_keys) != set(required_keys):
            # BUG FIX: the old message labeled the *expected* keys as the
            # "wrong keys"; make clear which set is required vs. found.
            logger.warning("Algorithm %s does not have the required keys %s",
                           algo, required_keys)
            logger.warning("Keys found instead: %s", algo_keys)
        else:
            # determine whether or not model type is valid
            model_type = specs[algo]['model_type']
            if model_type in model_types:
                specs[algo]['model_type'] = ModelType(model_types[model_type])
            else:
                raise ValueError("algos.yml model:type %s unrecognized" % model_type)

    # Algorithm Specifications
    return specs
def get_model_config():
    r"""Read in the configuration file for AlphaPy.

    Parameters
    ----------
    None : None

    Returns
    -------
    specs : dict
        The parameters for controlling AlphaPy.

    Raises
    ------
    ValueError
        Unrecognized value of a ``model.yml`` field.
    """

    logger.info("Model Configuration")

    # Read the configuration file
    full_path = SSEP.join([PSEP, 'config', 'model.yml'])
    with open(full_path, 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)

    # Store configuration parameters in dictionary
    specs = {}

    # Section: project [this section must be first]

    specs['directory'] = cfg['project']['directory']
    specs['extension'] = cfg['project']['file_extension']
    specs['submission_file'] = cfg['project']['submission_file']
    specs['submit_probas'] = cfg['project']['submit_probas']

    # Section: data

    specs['drop'] = cfg['data']['drop']
    specs['features'] = cfg['data']['features']
    specs['sentinel'] = cfg['data']['sentinel']
    specs['separator'] = cfg['data']['separator']
    specs['shuffle'] = cfg['data']['shuffle']
    specs['split'] = cfg['data']['split']
    specs['target'] = cfg['data']['target']
    specs['target_value'] = cfg['data']['target_value']
    # sampling
    specs['sampling'] = cfg['data']['sampling']['option']
    # determine whether or not sampling method is valid
    samplers = {x.name: x.value for x in SamplingMethod}
    sampling_method = cfg['data']['sampling']['method']
    if sampling_method in samplers:
        specs['sampling_method'] = SamplingMethod(samplers[sampling_method])
    else:
        raise ValueError("model.yml data:sampling:method %s unrecognized" % sampling_method)
    # end of sampling method
    specs['sampling_ratio'] = cfg['data']['sampling']['ratio']

    # Section: features

    # clustering
    specs['clustering'] = cfg['features']['clustering']['option']
    specs['cluster_min'] = cfg['features']['clustering']['minimum']
    specs['cluster_max'] = cfg['features']['clustering']['maximum']
    specs['cluster_inc'] = cfg['features']['clustering']['increment']
    # counts
    specs['counts'] = cfg['features']['counts']['option']
    # encoding
    specs['rounding'] = cfg['features']['encoding']['rounding']
    # determine whether or not encoder is valid
    encoders = {x.name: x.value for x in Encoders}
    encoder = cfg['features']['encoding']['type']
    if encoder in encoders:
        specs['encoder'] = Encoders(encoders[encoder])
    else:
        raise ValueError("model.yml features:encoding:type %s unrecognized" % encoder)
    # factors
    specs['factors'] = cfg['features']['factors']
    # interactions
    specs['interactions'] = cfg['features']['interactions']['option']
    specs['isample_pct'] = cfg['features']['interactions']['sampling_pct']
    specs['poly_degree'] = cfg['features']['interactions']['poly_degree']
    # isomap
    specs['isomap'] = cfg['features']['isomap']['option']
    specs['iso_components'] = cfg['features']['isomap']['components']
    specs['iso_neighbors'] = cfg['features']['isomap']['neighbors']
    # log transformation
    specs['logtransform'] = cfg['features']['logtransform']['option']
    # low-variance features
    specs['lv_remove'] = cfg['features']['variance']['option']
    specs['lv_threshold'] = cfg['features']['variance']['threshold']
    # NumPy
    specs['numpy'] = cfg['features']['numpy']['option']
    # pca
    specs['pca'] = cfg['features']['pca']['option']
    specs['pca_min'] = cfg['features']['pca']['minimum']
    specs['pca_max'] = cfg['features']['pca']['maximum']
    specs['pca_inc'] = cfg['features']['pca']['increment']
    specs['pca_whiten'] = cfg['features']['pca']['whiten']
    # Scaling
    specs['scaler_option'] = cfg['features']['scaling']['option']
    # determine whether or not scaling type is valid
    scaler_types = {x.name: x.value for x in Scalers}
    scaler_type = cfg['features']['scaling']['type']
    if scaler_type in scaler_types:
        specs['scaler_type'] = Scalers(scaler_types[scaler_type])
    else:
        raise ValueError("model.yml features:scaling:type %s unrecognized" % scaler_type)
    # SciPy
    specs['scipy'] = cfg['features']['scipy']['option']
    # text
    specs['ngrams_max'] = cfg['features']['text']['ngrams']
    specs['vectorize'] = cfg['features']['text']['vectorize']
    # t-sne
    specs['tsne'] = cfg['features']['tsne']['option']
    specs['tsne_components'] = cfg['features']['tsne']['components']
    specs['tsne_learn_rate'] = cfg['features']['tsne']['learning_rate']
    specs['tsne_perplexity'] = cfg['features']['tsne']['perplexity']

    # Section: model

    specs['algorithms'] = cfg['model']['algorithms']
    specs['cv_folds'] = cfg['model']['cv_folds']
    # determine whether or not model type is valid
    model_types = {x.name: x.value for x in ModelType}
    model_type = cfg['model']['type']
    if model_type in model_types:
        specs['model_type'] = ModelType(model_types[model_type])
    else:
        raise ValueError("model.yml model:type %s unrecognized" % model_type)
    # end of model type
    specs['n_estimators'] = cfg['model']['estimators']
    specs['pvalue_level'] = cfg['model']['pvalue_level']
    specs['scorer'] = cfg['model']['scoring_function']
    # calibration
    specs['calibration'] = cfg['model']['calibration']['option']
    specs['cal_type'] = cfg['model']['calibration']['type']
    # feature selection
    specs['feature_selection'] = cfg['model']['feature_selection']['option']
    specs['fs_percentage'] = cfg['model']['feature_selection']['percentage']
    specs['fs_uni_grid'] = cfg['model']['feature_selection']['uni_grid']
    score_func = cfg['model']['feature_selection']['score_func']
    if score_func in feature_scorers:
        specs['fs_score_func'] = feature_scorers[score_func]
    else:
        raise ValueError(
            "model.yml model:feature_selection:score_func %s unrecognized" % score_func)
    # grid search
    specs['grid_search'] = cfg['model']['grid_search']['option']
    specs['gs_iters'] = cfg['model']['grid_search']['iterations']
    specs['gs_random'] = cfg['model']['grid_search']['random']
    specs['gs_sample'] = cfg['model']['grid_search']['subsample']
    specs['gs_sample_pct'] = cfg['model']['grid_search']['sampling_pct']
    # rfe
    specs['rfe'] = cfg['model']['rfe']['option']
    specs['rfe_step'] = cfg['model']['rfe']['step']

    # Section: pipeline

    specs['n_jobs'] = cfg['pipeline']['number_jobs']
    specs['seed'] = cfg['pipeline']['seed']
    specs['verbosity'] = cfg['pipeline']['verbosity']

    # Section: plots

    specs['calibration_plot'] = cfg['plots']['calibration']
    specs['confusion_matrix'] = cfg['plots']['confusion_matrix']
    specs['importances'] = cfg['plots']['importances']
    specs['learning_curve'] = cfg['plots']['learning_curve']
    specs['roc_curve'] = cfg['plots']['roc_curve']

    # Section: treatments

    # BUG FIX: the bare ``except:`` silently swallowed every exception;
    # only a missing 'treatments' section (KeyError) is an expected case.
    try:
        specs['treatments'] = cfg['treatments']
    except KeyError:
        specs['treatments'] = None
        logger.info("No Treatments Found")

    # Section: xgboost

    specs['esr'] = cfg['xgboost']['stopping_rounds']

    # Log the configuration parameters

    logger.info('MODEL PARAMETERS:')
    logger.info('algorithms       = %s', specs['algorithms'])
    logger.info('calibration      = %r', specs['calibration'])
    logger.info('cal_type         = %s', specs['cal_type'])
    # BUG FIX: this line previously logged specs['calibration'] instead
    # of the calibration_plot flag it claims to report.
    logger.info('calibration_plot = %r', specs['calibration_plot'])
    logger.info('clustering       = %r', specs['clustering'])
    logger.info('cluster_inc      = %d', specs['cluster_inc'])
    logger.info('cluster_max      = %d', specs['cluster_max'])
    logger.info('cluster_min      = %d', specs['cluster_min'])
    logger.info('confusion_matrix = %r', specs['confusion_matrix'])
    logger.info('counts           = %r', specs['counts'])
    logger.info('cv_folds         = %d', specs['cv_folds'])
    logger.info('directory        = %s', specs['directory'])
    logger.info('extension        = %s', specs['extension'])
    logger.info('drop             = %s', specs['drop'])
    logger.info('encoder          = %r', specs['encoder'])
    logger.info('esr              = %d', specs['esr'])
    logger.info('factors          = %s', specs['factors'])
    logger.info('features [X]     = %s', specs['features'])
    logger.info('feature_selection = %r', specs['feature_selection'])
    logger.info('fs_percentage    = %d', specs['fs_percentage'])
    logger.info('fs_score_func    = %s', specs['fs_score_func'])
    logger.info('fs_uni_grid      = %s', specs['fs_uni_grid'])
    logger.info('grid_search      = %r', specs['grid_search'])
    logger.info('gs_iters         = %d', specs['gs_iters'])
    logger.info('gs_random        = %r', specs['gs_random'])
    logger.info('gs_sample        = %r', specs['gs_sample'])
    logger.info('gs_sample_pct    = %f', specs['gs_sample_pct'])
    logger.info('importances      = %r', specs['importances'])
    logger.info('interactions     = %r', specs['interactions'])
    logger.info('isomap           = %r', specs['isomap'])
    logger.info('iso_components   = %d', specs['iso_components'])
    logger.info('iso_neighbors    = %d', specs['iso_neighbors'])
    logger.info('isample_pct      = %d', specs['isample_pct'])
    logger.info('learning_curve   = %r', specs['learning_curve'])
    logger.info('logtransform     = %r', specs['logtransform'])
    logger.info('lv_remove        = %r', specs['lv_remove'])
    logger.info('lv_threshold     = %f', specs['lv_threshold'])
    logger.info('model_type       = %r', specs['model_type'])
    logger.info('n_estimators     = %d', specs['n_estimators'])
    logger.info('n_jobs           = %d', specs['n_jobs'])
    logger.info('ngrams_max       = %d', specs['ngrams_max'])
    logger.info('numpy            = %r', specs['numpy'])
    logger.info('pca              = %r', specs['pca'])
    logger.info('pca_inc          = %d', specs['pca_inc'])
    logger.info('pca_max          = %d', specs['pca_max'])
    logger.info('pca_min          = %d', specs['pca_min'])
    logger.info('pca_whiten       = %r', specs['pca_whiten'])
    logger.info('poly_degree      = %d', specs['poly_degree'])
    logger.info('pvalue_level     = %f', specs['pvalue_level'])
    logger.info('rfe              = %r', specs['rfe'])
    logger.info('rfe_step         = %d', specs['rfe_step'])
    logger.info('roc_curve        = %r', specs['roc_curve'])
    logger.info('rounding         = %d', specs['rounding'])
    logger.info('sampling         = %r', specs['sampling'])
    logger.info('sampling_method  = %r', specs['sampling_method'])
    logger.info('sampling_ratio   = %f', specs['sampling_ratio'])
    logger.info('scaler_option    = %r', specs['scaler_option'])
    logger.info('scaler_type      = %r', specs['scaler_type'])
    logger.info('scipy            = %r', specs['scipy'])
    logger.info('scorer           = %s', specs['scorer'])
    logger.info('seed             = %d', specs['seed'])
    logger.info('sentinel         = %d', specs['sentinel'])
    logger.info('separator        = %s', specs['separator'])
    logger.info('shuffle          = %r', specs['shuffle'])
    logger.info('split            = %f', specs['split'])
    logger.info('submission_file  = %s', specs['submission_file'])
    logger.info('submit_probas    = %r', specs['submit_probas'])
    logger.info('target [y]       = %s', specs['target'])
    logger.info('target_value     = %d', specs['target_value'])
    logger.info('treatments       = %s', specs['treatments'])
    logger.info('tsne             = %r', specs['tsne'])
    logger.info('tsne_components  = %d', specs['tsne_components'])
    logger.info('tsne_learn_rate  = %f', specs['tsne_learn_rate'])
    logger.info('tsne_perplexity  = %f', specs['tsne_perplexity'])
    logger.info('vectorize        = %r', specs['vectorize'])
    logger.info('verbosity        = %d', specs['verbosity'])

    # Specifications to create the model
    return specs