import numpy as np
import pandas as pd
from sklearn import decomposition, feature_selection

import custom_models
import dag_parser
import utils

# NOTE: data_ready, get_data and fit_model are assumed to be defined elsewhere
# in this module.


def train_dag(dag, train_data, sample_weight=None):
    """Train all models in the DAG in data-dependency (topological) order.

    Each node ``m`` is encoded as ``dag[m] = (input_names, model_spec,
    output_name)``; the trained models are returned in a dict keyed by node
    name.
    """
    models = dict()
    data_cache = dict()

    if isinstance(train_data[0], np.ndarray) and isinstance(train_data[1], np.ndarray):
        # happens inside booster
        train_data = (pd.DataFrame(train_data[0]), pd.Series(train_data[1]))

    data_cache[dag['input'][2]] = train_data
    models['input'] = True

    def unfinished_models():
        return [m for m in dag if m not in models]

    def data_available():
        return [m for m in dag if data_ready(dag[m][0], data_cache)]

    def next_methods():
        return [m for m in unfinished_models() if m in data_available()]

    while next_methods():
        for m in next_methods():

            # obtain the data
            features, targets, *rest = get_data(dag[m][0], data_cache)
            if rest:
                sample_weight = rest[0]

            ModelClass, model_params = utils.get_model_by_name(dag[m][1])

            # parameter values arrive as strings, convert the literals
            for p in model_params:
                if model_params[p] == 'True':
                    model_params[p] = True
                elif model_params[p] == 'False':
                    model_params[p] = False
                elif model_params[p] == 'None':
                    model_params[p] = None

            out_name = dag[m][2]

            if dag[m][1][0] == 'stacker':
                sub_dags, initial_dag, input_data = dag_parser.extract_subgraphs(dag, m)
                model_params = dict(sub_dags=sub_dags, initial_dag=initial_dag)
                model = ModelClass(**model_params)
                features, targets = data_cache[input_data]
            elif isinstance(out_name, list):
                model = ModelClass(len(out_name), **model_params)
            else:
                # translate the relative 'feat_frac' parameter into the absolute
                # parameter expected by the respective sklearn model
                if issubclass(ModelClass, feature_selection.SelectKBest):
                    model_params = model_params.copy()
                    feat_frac = model_params.pop('feat_frac', 1.0)
                    model_params['k'] = max(1, int(feat_frac * (features.shape[1] - 1)))
                if issubclass(ModelClass, decomposition.PCA):
                    model_params = model_params.copy()
                    feat_frac = model_params.pop('feat_frac', 1.0)
                    model_params['n_components'] = max(1, int(feat_frac * (features.shape[1] - 1)))
                model = ModelClass(**model_params)  # build the model

            # some models cannot handle cases with only one class; we also need to
            # check we are not working with a list of inputs for an aggregator
            if (custom_models.is_predictor(model) and isinstance(targets, pd.Series)
                    and len(targets.unique()) == 1):
                model = custom_models.ConstantModel(targets.iloc[0])

            models[m] = fit_model(model, features, targets, sample_weight=sample_weight)
            model = models[m]  # needed to update the model if the result was cached

            # use the model to process the data
            if isinstance(model, custom_models.Stacker):
                data_cache[out_name] = model.train, targets.loc[model.train.index]
                continue
            if isinstance(model, custom_models.Aggregator):
                data_cache[out_name] = model.aggregate(features, targets)
                continue
            if custom_models.is_transformer(model):
                trans = model.transform(features)
            else:  # this is a classifier, not a preprocessor
                trans = features  # the data do not change
                if isinstance(features, pd.DataFrame):
                    targets = pd.Series(list(model.predict(features)), index=features.index)
                else:  # this should happen only inside booster
                    targets = pd.Series(list(model.predict(features)))

            # save the outputs
            if isinstance(trans, list):  # the previous model divided the data into several data-sets
                if isinstance(model, custom_models.KMeansSplitter) and sample_weight is not None:
                    # need to divide the targets and the weights
                    trans = [(x, targets.loc[x.index], sample_weight[model.weight_idx[i]])
                             for i, x in enumerate(trans)]
                else:
                    # need to divide the targets
                    trans = [(x, targets.loc[x.index]) for x in trans]
                for i in range(len(trans)):
                    data_cache[out_name[i]] = trans[i]  # save all the data to the cache
            else:
                if isinstance(features, pd.DataFrame):
                    trans = pd.DataFrame(trans, index=features.index)
                else:  # we have only one output, can be a numpy array
                    trans = pd.DataFrame(trans)
                trans.dropna(axis='columns', how='all', inplace=True)
                data_cache[out_name] = (trans, targets)  # save it

    return models
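
# A minimal construction sketch (not part of the original module): train_dag
# above indexes every node as dag[m] == (input_names, model_spec, output_name),
# with the special 'input' node seeding data_cache[dag['input'][2]]. The node
# names, the ('SelectKBest', {...}) spec shape and the 'gaussianNB' model name
# below are hypothetical placeholders for whatever utils.get_model_by_name
# actually accepts.
def _example_dag():
    return {
        'input': ([], 'input', 'data'),  # source node, seeds the cache under 'data'
        'select': (['data'], ('SelectKBest', {'feat_frac': 0.5}), 'reduced'),
        'clf': (['reduced'], ('gaussianNB', {}), 'output'),  # final node must write 'output'
    }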
def test_dag(dag, models, test_data, output='preds_only'):
    """Run ``test_data`` through a DAG of already trained ``models``.

    Depending on ``output``, return the final features, the final predictions,
    or both.
    """
    data_cache = dict()
    finished = dict()

    if isinstance(test_data[0], np.ndarray):
        test_data = (pd.DataFrame(test_data[0]), test_data[1])
    if isinstance(test_data[1], np.ndarray):
        test_data = (test_data[0], pd.Series(test_data[1], index=test_data[0].index))

    data_cache[dag['input'][2]] = test_data
    finished['input'] = True

    def unfinished_models():
        return [m for m in dag if m not in finished]

    def data_available():
        return [m for m in dag if data_ready(dag[m][0], data_cache)]

    def next_methods():
        return [m for m in unfinished_models() if m in data_available()]

    while next_methods():
        for m in next_methods():

            # obtain the data
            features, targets = get_data(dag[m][0], data_cache)
            model = models[m]
            out_name = dag[m][2]

            # we got an empty dataset (after some earlier division)
            if isinstance(features, pd.DataFrame) and features.empty:
                if isinstance(out_name, list):  # and we should divide it further
                    for o in out_name:
                        data_cache[o] = (features, targets)
                else:
                    data_cache[out_name] = (features, targets)
                finished[m] = True
                continue

            # use the model to process the data
            if isinstance(model, custom_models.Aggregator):
                data_cache[out_name] = model.aggregate(features, targets)
                finished[m] = True
                continue
            elif custom_models.is_transformer(model):
                trans = model.transform(features)
                targets = pd.Series(targets, index=features.index)
            else:  # this is a classifier, not a preprocessor
                trans = features  # the data do not change
                if isinstance(features, pd.DataFrame):
                    targets = pd.Series(list(model.predict(features)), index=features.index)
                else:
                    targets = pd.Series(list(model.predict(features)))

            # save the outputs
            if isinstance(trans, list):  # the previous model divided the data into several data-sets
                trans = [(x, targets.loc[x.index]) for x in trans]  # need to divide the targets
                for i in range(len(trans)):
                    data_cache[out_name[i]] = trans[i]  # save all the data to the cache
            else:
                if isinstance(features, pd.DataFrame):
                    trans = pd.DataFrame(trans, index=features.index)
                else:  # we have only one output, can be a numpy array
                    trans = pd.DataFrame(trans)
                trans.dropna(axis='columns', how='all', inplace=True)
                data_cache[out_name] = (trans, targets)  # save it

            finished[m] = True

    if output == 'all':
        return data_cache['output']
    if output == 'preds_only':
        return data_cache['output'][1]
    if output == 'feats_only':
        return data_cache['output'][0]
    raise ValueError('{} is not a valid output type'.format(output))
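
# A hedged end-to-end sketch: fit the toy DAG from _example_dag on iris and
# score the training predictions. _example_dag and its model specs are
# illustrative assumptions; only train_dag and test_dag come from this module.
if __name__ == '__main__':
    from sklearn.datasets import load_iris

    iris = load_iris()
    data = (pd.DataFrame(iris.data), pd.Series(iris.target))
    dag = _example_dag()
    models = train_dag(dag, data)
    preds = test_dag(dag, models, data)  # default output='preds_only'
    print('train accuracy:', (preds.values == iris.target).mean())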