コード例 #1
0
def train_dag(dag, train_data, sample_weight=None):
    """Fit every model of ``dag`` on ``train_data`` and return the fitted models.

    Each entry ``dag[m]`` is indexed as ``(inputs, model_spec, outputs)``:
    ``inputs`` is handed to ``data_ready``/``get_data``, ``model_spec`` to
    ``utils.get_model_by_name`` and ``outputs`` is the data-cache key (or the
    list of keys) under which the results of the node are stored.

    Args:
        dag: Mapping from node name to its description.  It must contain an
            ``'input'`` node; ``train_data`` is stored under its output key.
        train_data: ``(features, targets)`` pair.  When both items are numpy
            arrays they are wrapped in a ``DataFrame`` and a ``Series``.
        sample_weight: Optional weights forwarded to ``fit_model``.

    Returns:
        dict mapping node names to the fitted models; ``'input'`` maps to
        ``True``.
    """
    models = dict()
    data_cache = dict()

    # happens inside booster
    isarray_0 = isinstance(train_data[0], np.ndarray)
    isarray_1 = isinstance(train_data[1], np.ndarray)
    if isarray_0 and isarray_1:
        train_data = (pd.DataFrame(train_data[0]), pd.Series(train_data[1]))

    data_cache[dag['input'][2]] = train_data
    models['input'] = True

    # textual spellings of literals that may appear among model parameters
    literals = {'True': True, 'False': False, 'None': None}

    def next_methods():
        # nodes that are not trained yet and whose inputs are in the cache;
        # one pass over the dag, data_ready is asked once per untrained node
        return [
            m for m in dag
            if m not in models and data_ready(dag[m][0], data_cache)
        ]

    ready = next_methods()
    while ready:

        for m in ready:
            # obtain the data
            features, targets, *rest = get_data(dag[m][0], data_cache)
            if rest:
                # NOTE(review): this rebinding persists for the nodes
                # processed afterwards whose data carry no weights of their
                # own -- confirm that this is intended
                sample_weight = rest[0]
            ModelClass, model_params = utils.get_model_by_name(dag[m][1])

            # only strings are compared, other values are left untouched
            for p, value in model_params.items():
                if isinstance(value, str) and value in literals:
                    model_params[p] = literals[value]

            out_name = dag[m][2]
            if dag[m][1][0] == 'stacker':
                sub_dags, initial_dag, input_data = \
                    dag_parser.extract_subgraphs(dag, m)
                model_params = dict(sub_dags=sub_dags, initial_dag=initial_dag)
                model = ModelClass(**model_params)
                features, targets = data_cache[input_data]
            elif isinstance(out_name, list):
                model = ModelClass(len(out_name), **model_params)
            else:
                model = ModelClass(**model_params)

            # build the model
            # some models cannot handle cases with only one class, we also
            # need to check we are not working with a list of inputs for an
            # aggregator
            if custom_models.is_predictor(model) and isinstance(
                    targets, pd.Series) and len(targets.unique()) == 1:
                model = custom_models.ConstantModel(targets.iloc[0])
            models[m] = fit_model(model,
                                  features,
                                  targets,
                                  sample_weight=sample_weight)
            # needed to update model if the result was cached
            model = models[m]

            # use the model to process the data
            if isinstance(model, custom_models.Stacker):
                # label based selection; the .ix indexer used here before
                # was removed from pandas
                data_cache[out_name] = (model.train,
                                        targets.loc[model.train.index])
                continue
            if isinstance(model, custom_models.Aggregator):
                data_cache[out_name] = model.aggregate(features, targets)
                continue
            if custom_models.is_transformer(model):
                trans = model.transform(features)
            else:  # this is a classifier not a preprocessor
                trans = features  # the data do not change
                if isinstance(features, pd.DataFrame):
                    targets = pd.Series(list(model.predict(features)),
                                        index=features.index)
                else:  # this should happen only inside booster
                    targets = pd.Series(list(model.predict(features)))

            # save the outputs
            # the previous model divided the data into several data-sets
            if isinstance(trans, list):
                if isinstance(model, custom_models.KMeansSplitter
                              ) and sample_weight is not None:
                    # need to divide the targets and the weights
                    trans = [(x, targets.loc[x.index],
                              sample_weight[model.weight_idx[i]])
                             for i, x in enumerate(trans)]
                else:
                    # need to divide the targets
                    trans = [(x, targets.loc[x.index]) for x in trans]
                for i, part in enumerate(trans):
                    # save all the data to the cache
                    data_cache[out_name[i]] = part
            else:
                if isinstance(features, pd.DataFrame):
                    # we have only one output, can be numpy array
                    trans = pd.DataFrame(trans, index=features.index)
                else:
                    trans = pd.DataFrame(trans)
                trans.dropna(axis='columns', how='all', inplace=True)
                data_cache[out_name] = (trans, targets)  # save it

        ready = next_methods()

    return models
コード例 #2
0
ファイル: eval.py プロジェクト: martinpilat/dag-evaluate
def train_dag(dag, train_data, sample_weight=None):
    """Fit every model of ``dag`` on ``train_data`` and return the fitted models.

    Each entry ``dag[m]`` is indexed as ``(inputs, model_spec, outputs)``:
    ``inputs`` is handed to ``data_ready``/``get_data``, ``model_spec`` to
    ``utils.get_model_by_name`` and ``outputs`` is the data-cache key (or the
    list of keys) under which the results of the node are stored.

    Args:
        dag: Mapping from node name to its description.  It must contain an
            ``'input'`` node; ``train_data`` is stored under its output key.
        train_data: ``(features, targets)`` pair.  When both items are numpy
            arrays they are wrapped in a ``DataFrame`` and a ``Series``.
        sample_weight: Optional weights forwarded to ``fit_model``.

    Returns:
        dict mapping node names to the fitted models; ``'input'`` maps to
        ``True``.
    """
    models = dict()
    data_cache = dict()

    if isinstance(train_data[0], np.ndarray) and isinstance(train_data[1], np.ndarray): # happens inside booster
        train_data = (pd.DataFrame(train_data[0]), pd.Series(train_data[1]))

    data_cache[dag['input'][2]] = train_data
    models['input'] = True

    def next_methods():
        # nodes that are not trained yet and whose inputs are in the cache;
        # one pass over the dag, data_ready is asked once per untrained node
        return [m for m in dag if m not in models and data_ready(dag[m][0], data_cache)]

    ready = next_methods()
    while ready:

        for m in ready:
            # obtain the data
            features, targets, *rest = get_data(dag[m][0], data_cache)
            if rest:
                # NOTE(review): this rebinding persists for the nodes processed afterwards whose data
                # carry no weights of their own -- confirm that this is intended
                sample_weight = rest[0]
            ModelClass, model_params = utils.get_model_by_name(dag[m][1])
            out_name = dag[m][2]
            if dag[m][1][0] == 'stacker':
                sub_dags, initial_dag, input_data = extract_subgraphs(dag, m)
                model_params = dict(sub_dags=sub_dags, initial_dag=initial_dag)
                model = ModelClass(**model_params)
                features, targets = data_cache[input_data]
            elif isinstance(out_name, list):
                model = ModelClass(len(out_name), **model_params)
            else:
                # a default-constructed instance is used only to find out the type of the model
                probe = ModelClass()
                sized_models = ((feature_selection.SelectKBest, 'k'),
                                (decomposition.PCA, 'n_components'))
                for base, size_param in sized_models:
                    if isinstance(probe, base):
                        # copy first so that the dict obtained from utils is never modified
                        model_params = model_params.copy()
                        feat_frac = model_params.pop('feat_frac', 1.0)
                        # 'feat_frac' is translated to an absolute count, at least 1
                        model_params[size_param] = max(1, int(feat_frac*(features.shape[1]-1)))
                model = ModelClass(**model_params)

            # build the model
            # some models cannot handle cases with only one class, we also need to check we are not working with a list
            # of inputs for an aggregator
            if custom_models.is_predictor(model) and isinstance(targets, pd.Series) and len(targets.unique()) == 1:
                model = custom_models.ConstantModel(targets.iloc[0])
            models[m] = fit_model(model, features, targets, sample_weight=sample_weight)
            model = models[m]  # needed to update model if the result was cached

            # use the model to process the data
            if isinstance(model, custom_models.Stacker):
                # label based selection; the .ix indexer used here before was removed from pandas
                data_cache[out_name] = model.train, targets.loc[model.train.index]
                continue
            if isinstance(model, custom_models.Aggregator):
                data_cache[out_name] = model.aggregate(features, targets)
                continue
            if custom_models.is_transformer(model):
                trans = model.transform(features)
            else:              # this is a classifier not a preprocessor
                trans = features                # the data do not change
                if isinstance(features, pd.DataFrame):
                    targets = pd.Series(list(model.predict(features)), index=features.index)
                else: # this should happen only inside booster
                    targets = pd.Series(list(model.predict(features)))

            # save the outputs
            if isinstance(trans, list):         # the previous model divided the data into several data-sets
                if isinstance(model, custom_models.KMeansSplitter) and sample_weight is not None:
                    trans = [(x, targets.loc[x.index], sample_weight[model.weight_idx[i]]) for i, x in enumerate(trans)]  # need to divide the targets and the weights
                else:
                    trans = [(x, targets.loc[x.index]) for x in trans]     # need to divide the targets
                for i, part in enumerate(trans):
                    data_cache[out_name[i]] = part          # save all the data to the cache
            else:
                if isinstance(features, pd.DataFrame):
                    trans = pd.DataFrame(trans, index=features.index)       # we have only one output, can be numpy array
                else:
                    trans = pd.DataFrame(trans)
                trans.dropna(axis='columns', how='all', inplace=True)
                data_cache[out_name] = (trans, targets)                 # save it

        ready = next_methods()

    return models
コード例 #3
0
def test_dag(dag, models, test_data, output='preds_only'):
    data_cache = dict()
    finished = dict()

    if isinstance(test_data[0], np.ndarray):
        test_data = (pd.DataFrame(test_data[0]), test_data[1])

    if isinstance(test_data[1], np.ndarray):
        test_data = (test_data[0],
                     pd.Series(test_data[1], index=test_data[0].index))

    data_cache[dag['input'][2]] = test_data
    finished['input'] = True

    def unfinished_models():
        return [m for m in dag if m not in finished]

    def data_available():
        return [m for m in dag if data_ready(dag[m][0], data_cache)]

    def next_methods():
        return [m for m in unfinished_models() if m in data_available()]

    while next_methods():

        for m in next_methods():

            # obtain the data
            features, targets = get_data(dag[m][0], data_cache)
            model = models[m]
            out_name = dag[m][2]

            # we got empty dataset (after same division)
            if isinstance(features, pd.DataFrame) and features.empty:
                # and we should divide it further
                if isinstance(out_name, list):
                    for o in out_name:
                        data_cache[o] = (features, targets)
                else:
                    data_cache[out_name] = (features, targets)
                finished[m] = True
                continue

            # use the model to process the data
            if isinstance(model, custom_models.Aggregator):
                data_cache[out_name] = model.aggregate(features, targets)
                finished[m] = True
                continue
            elif custom_models.is_transformer(model):
                trans = model.transform(features)
                targets = pd.Series(targets, index=features.index)
            else:  # this is a classifier not a preprocessor
                trans = features  # the data do not change
                if isinstance(features, pd.DataFrame):
                    targets = pd.Series(list(model.predict(features)),
                                        index=features.index)
                else:
                    targets = pd.Series(list(model.predict(features)))

            # save the outputs
            # the previous model divided the data into several data-sets
            if isinstance(trans, list):
                trans = [(x, targets.loc[x.index])
                         for x in trans]  # need to divide the targets
                for i in range(len(trans)):
                    # save all the data to the cache
                    data_cache[out_name[i]] = trans[i]
            else:
                if isinstance(features, pd.DataFrame):
                    # we have only one output, can be numpy array
                    trans = pd.DataFrame(trans, index=features.index)
                else:
                    trans = pd.DataFrame(trans)
                trans.dropna(axis='columns', how='all', inplace=True)
                data_cache[out_name] = (trans, targets)  # save it

            finished[m] = True

    if output == 'all':
        return data_cache['output']
    if output == 'preds_only':
        return data_cache['output'][1]
    if output == 'feats_only':
        return data_cache['output'][0]

    raise AttributeError(output, 'is not a valid output type')
コード例 #4
0
ファイル: eval.py プロジェクト: martinpilat/dag-evaluate
def test_dag(dag, models, test_data, output='preds_only'):
    data_cache = dict()
    finished = dict()

    if isinstance(test_data[0], np.ndarray):
        test_data = (pd.DataFrame(test_data[0]), test_data[1])

    if isinstance(test_data[1], np.ndarray):
        test_data = (test_data[0], pd.Series(test_data[1], index=test_data[0].index))

    data_cache[dag['input'][2]] = test_data
    finished['input'] = True

    unfinished_models = lambda: [m for m in dag if m not in finished]
    data_available = lambda: [m for m in dag if data_ready(dag[m][0], data_cache)]
    next_methods = lambda: [m for m in unfinished_models() if m in data_available()]

    while next_methods():

        for m in next_methods():

            # obtain the data
            features, targets = get_data(dag[m][0], data_cache)
            model = models[m]
            out_name = dag[m][2]

            if isinstance(features, pd.DataFrame) and features.empty:   # we got empty dataset (after same division)
                if isinstance(out_name, list):                          # and we should divide it further
                    for o in out_name:
                        data_cache[o] = (features, targets)
                else:
                    data_cache[out_name] = (features, targets)
                finished[m] = True
                continue

            # use the model to process the data
            if isinstance(model, custom_models.Aggregator):
                data_cache[out_name] = model.aggregate(features, targets)
                finished[m] = True
                continue
            elif custom_models.is_transformer(model):
                trans = model.transform(features)
                targets = pd.Series(targets, index=features.index)
            else:                                                       # this is a classifier not a preprocessor
                trans = features                                        # the data do not change
                if isinstance(features, pd.DataFrame):
                    targets = pd.Series(list(model.predict(features)), index=features.index)
                else:
                    targets = pd.Series(list(model.predict(features)))

            # save the outputs
            if isinstance(trans, list):                     # the previous model divided the data into several data-sets
                trans = [(x, targets.loc[x.index]) for x in trans]      # need to divide the targets
                for i in range(len(trans)):
                    data_cache[out_name[i]] = trans[i]                  # save all the data to the cache
            else:
                if isinstance(features, pd.DataFrame):
                    trans = pd.DataFrame(trans, index=features.index)       # we have only one output, can be numpy array
                else:
                    trans = pd.DataFrame(trans)
                trans.dropna(axis='columns', how='all', inplace=True)
                data_cache[out_name] = (trans, targets)                 # save it

            finished[m] = True

    if output == 'all':
        return data_cache['output']
    if output == 'preds_only':
        return data_cache['output'][1]
    if output == 'feats_only':
        return data_cache['output'][0]

    raise AttributeError(output, 'is not a valid output type')