Example #1
0
def _fit_local(params,
               model_factory,
               list_of_parts,
               worker_addresses,
               return_model,
               local_listen_port=12400,
               listen_time_out=120,
               **kwargs):
    """Train one estimator on the data parts local to this worker.

    Builds LightGBM network parameters for distributed training, fits
    ``model_factory(**params)`` on the concatenated local parts, and
    always frees LightGBM's network state afterwards.

    Returns the fitted estimator, or ``None`` when ``return_model`` is
    falsy.
    """
    merged_params = dict(params)
    merged_params.update(
        build_network_params(worker_addresses,
                             get_worker().address,
                             local_listen_port,
                             listen_time_out))

    # Unzip the per-part tuples; a third column, when present, holds the
    # sample weights.
    if len(list_of_parts[0]) == 3:
        data, labels, weight = zip(*list_of_parts)
        weight = concat(weight)
    else:
        data, labels = zip(*list_of_parts)
        weight = None

    # Concatenate the many parts into single data/label objects.
    data = concat(data)
    labels = concat(labels)

    try:
        fitted = model_factory(**merged_params)
        fitted.fit(data, labels, sample_weight=weight)
    finally:
        # Release LightGBM network resources even if fit raises.
        _safe_call(_LIB.LGBM_NetworkFree())

    return fitted if return_model else None
Example #2
0
def _train_part(params,
                model_factory,
                list_of_parts,
                worker_addresses,
                return_model,
                local_listen_port=12400,
                time_out=120,
                **kwargs):
    """Fit one LightGBM estimator on the data parts local to this worker.

    Parameters
    ----------
    params : dict
        Estimator parameters; left unmodified (a merged copy is used).
    model_factory : callable
        Estimator class to instantiate, e.g. ``lightgbm.LGBMClassifier``.
    list_of_parts : list of tuple
        ``(data, label)`` or ``(data, label, weight)`` tuples.
    worker_addresses : iterable
        Addresses of all workers joining distributed training.
    return_model : bool
        When falsy, return ``None`` instead of the fitted model.
    local_listen_port : int
        Port this worker listens on for LightGBM's network setup.
    time_out : int
        Network time-out passed through to LightGBM.

    Returns
    -------
    fitted estimator or None
    """
    network_params = build_network_params(worker_addresses,
                                          get_worker().address,
                                          local_listen_port, time_out)
    # Merge into a fresh dict: the previous ``params.update(...)`` mutated
    # the caller's dict, leaking per-worker network settings (machines,
    # local_listen_port, ...) back upstream.
    params = {**params, **network_params}

    # Concatenate many parts into one
    parts = tuple(zip(*list_of_parts))
    data = concat(parts[0])
    label = concat(parts[1])
    weight = concat(parts[2]) if len(parts) == 3 else None

    try:
        model = model_factory(**params)
        model.fit(data, label, sample_weight=weight)
    finally:
        # Always free LightGBM's network resources, even when fit fails.
        _safe_call(_LIB.LGBM_NetworkFree())

    return model if return_model else None
Example #3
0
def _train_part(params,
                model_factory,
                list_of_parts,
                worker_addresses,
                return_model,
                local_listen_port=12400,
                time_out=120,
                **kwargs):
    """Fit one LightGBM estimator on this worker's parts, with optional
    evaluation data.

    ``kwargs['parts_list']`` names each column of ``list_of_parts``
    (e.g. ``['X', 'y', 'weight', 'valid_X', 'valid_y']``) and is used to
    locate the role of every part; 'X' and 'y' are required. Returns the
    fitted model, or ``None`` when ``return_model`` is falsy.
    """
    network_params = build_network_params(worker_addresses,
                                          get_worker().address,
                                          local_listen_port, time_out)
    # Merge into a new dict instead of ``params.update(...)``: updating in
    # place mutated the caller's dict with per-worker network settings.
    params = {**params, **network_params}

    # Concatenate many parts into one
    parts = tuple(zip(*list_of_parts))
    parts_list = kwargs['parts_list']

    def _concat_optional(name):
        # Concatenate the column named ``name``, or None when absent.
        if name in parts_list:
            return concat(parts[parts_list.index(name)])
        return None

    data = concat(parts[parts_list.index('X')])
    label = concat(parts[parts_list.index('y')])
    weight = _concat_optional('weight')
    valid_X = _concat_optional('valid_X')
    valid_y = _concat_optional('valid_y')
    eval_sample_weight = _concat_optional('eval_sample_weight')

    # only first eval_set supported
    kwargs = kwargs.copy()  # avoid contaminating upstream
    if valid_X is not None and valid_y is not None:
        kwargs['eval_set'] = [(valid_X, valid_y)]
        kwargs['eval_sample_weight'] = [eval_sample_weight]
    kwargs.pop('parts_list', None)

    try:
        model = model_factory(**params)
        model.fit(data, label, sample_weight=weight, **kwargs)
    finally:
        # Always free LightGBM's network resources, even when fit raises.
        _safe_call(_LIB.LGBM_NetworkFree())

    return model if return_model else None
Example #4
0
def _fit_local(params,
               model_factory,
               list_of_parts,
               worker_addresses,
               local_listen_port=12400,
               listen_time_out=120,
               **kwargs):
    """Fit a model on this worker's concatenated ``(data, labels)`` parts.

    Always returns the fitted estimator; LightGBM's network state is
    released in every case via the ``finally`` block.
    """
    full_params = {
        **params,
        **build_network_params(worker_addresses,
                               get_worker().address,
                               local_listen_port, listen_time_out),
    }

    # Each part is a (data, labels) pair: split columns, then concatenate
    # the many parts into single objects.
    data_parts, label_parts = zip(*list_of_parts)
    data = concat(data_parts)
    labels = concat(label_parts)

    try:
        estimator = model_factory(**full_params)
        estimator.fit(data, labels)
    finally:
        _safe_call(_LIB.LGBM_NetworkFree())
    return estimator
Example #5
0
    def execute(cls, ctx, op: "LGBMTrain"):
        """Fit a LightGBM model for this chunk and store the pickled result.

        When ``op.merge`` is set, delegate to the parent implementation,
        which merges already-trained sub-models instead of fitting here.
        """
        if op.merge:
            return super().execute(ctx, op)

        # Lazy import: lightgbm is only required on workers that train.
        from lightgbm.basic import _safe_call, _LIB

        data_val = ctx[op.data.key]
        # Unwrap SciPy-backed chunk wrappers to the raw sparse matrix.
        data_val = data_val.spmatrix if hasattr(data_val,
                                                'spmatrix') else data_val

        label_val = ctx[op.label.key]
        sample_weight_val = ctx[
            op.sample_weight.key] if op.sample_weight is not None else None
        init_score_val = ctx[
            op.init_score.key] if op.init_score is not None else None

        if op.eval_datas is None:
            eval_set, eval_sample_weight, eval_init_score = None, None, None
        else:
            # Resolve evaluation datasets / weights / init scores from the
            # execution context, unwrapping sparse wrappers as above.
            eval_set, eval_sample_weight, eval_init_score = [], [], []
            for data, label in zip(op.eval_datas, op.eval_labels):
                data_eval = ctx[data.key]
                data_eval = data_eval.spmatrix if hasattr(
                    data_eval, 'spmatrix') else data_eval
                eval_set.append((data_eval, ctx[label.key]))
            for weight in op.eval_sample_weights:
                eval_sample_weight.append(
                    ctx[weight.key] if weight is not None else None)
            for score in op.eval_init_scores:
                eval_init_score.append(
                    ctx[score.key] if score is not None else None)

            # Collapse empty lists back to None so lightgbm skips eval.
            eval_set = eval_set or None
            eval_sample_weight = eval_sample_weight or None
            eval_init_score = eval_init_score or None

        params = op.params.copy()
        # if model is trained, remove unsupported parameters
        params.pop('out_dtype_', None)
        if ctx.running_mode == RunningMode.distributed:
            # Build the "ip:port" machine list LightGBM needs for its
            # socket-based distributed training.
            worker_ports = ctx[op.worker_ports.key]
            worker_ips = [worker.split(':', 1)[0] for worker in op.workers]
            worker_endpoints = [
                f'{worker}:{port}'
                for worker, port in zip(worker_ips, worker_ports)
            ]

            params['machines'] = ','.join(worker_endpoints)
            params['time_out'] = op.timeout
            params['num_machines'] = len(worker_endpoints)
            params['local_listen_port'] = worker_ports[op.worker_id]

            # Only these tree_learner modes are valid for distributed
            # LightGBM; fall back to data-parallel training otherwise.
            if (op.tree_learner
                    or '').lower() not in {'data', 'feature', 'voting'}:
                logger.warning(
                    'Parameter tree_learner not set or set to incorrect value '
                    f'{op.tree_learner}, using "data" as default')
                params['tree_learner'] = 'data'
            else:
                params['tree_learner'] = op.tree_learner

        try:
            model_cls = get_model_cls_from_type(op.model_type)
            model = model_cls(**params)
            model.fit(data_val,
                      label_val,
                      sample_weight=sample_weight_val,
                      init_score=init_score_val,
                      eval_set=eval_set,
                      eval_sample_weight=eval_sample_weight,
                      eval_init_score=eval_init_score,
                      **op.kwds)

            # Record the prediction dtype for downstream predict operands:
            # rankers/regressors emit floats; otherwise follow the label
            # dtype (``dtypes[0]`` when the labels are a DataFrame).
            if op.model_type == LGBMModelType.RANKER or \
                    op.model_type == LGBMModelType.REGRESSOR:
                model.set_params(out_dtype_=np.dtype('float'))
            elif hasattr(label_val, 'dtype'):
                model.set_params(out_dtype_=label_val.dtype)
            else:
                model.set_params(out_dtype_=label_val.dtypes[0])

            ctx[op.outputs[0].key] = pickle.dumps(model)
        finally:
            # Always release LightGBM's network state, even on failure, so
            # the worker can join a later training session.
            _safe_call(_LIB.LGBM_NetworkFree())
Example #6
0
    def execute(cls, ctx, op: "LGBMTrain"):
        """Fit a LightGBM model for this chunk and store the pickled result.

        When ``op.merge`` is set, delegate to the parent class, which
        merges already-trained sub-models rather than training here.
        """
        if op.merge:
            return super().execute(ctx, op)

        # Lazy import: lightgbm is only needed on workers that train.
        from lightgbm.basic import _safe_call, _LIB

        data_val = ctx[op.data.key]
        # Unwrap SciPy-backed chunk wrappers to the raw sparse matrix.
        data_val = data_val.spmatrix if hasattr(data_val,
                                                "spmatrix") else data_val

        label_val = ctx[op.label.key]
        sample_weight_val = (ctx[op.sample_weight.key]
                             if op.sample_weight is not None else None)
        init_score_val = ctx[
            op.init_score.key] if op.init_score is not None else None

        if op.eval_datas is None:
            eval_set, eval_sample_weight, eval_init_score = None, None, None
        else:
            # Resolve evaluation datasets / weights / init scores from the
            # execution context, unwrapping sparse wrappers as above.
            eval_set, eval_sample_weight, eval_init_score = [], [], []
            for data, label in zip(op.eval_datas, op.eval_labels):
                data_eval = ctx[data.key]
                data_eval = (data_eval.spmatrix if hasattr(
                    data_eval, "spmatrix") else data_eval)
                eval_set.append((data_eval, ctx[label.key]))
            for weight in op.eval_sample_weights:
                eval_sample_weight.append(
                    ctx[weight.key] if weight is not None else None)
            for score in op.eval_init_scores:
                eval_init_score.append(
                    ctx[score.key] if score is not None else None)

            # Collapse empty lists back to None so lightgbm skips eval.
            eval_set = eval_set or None
            eval_sample_weight = eval_sample_weight or None
            eval_init_score = eval_init_score or None

        params = op.params.copy()
        # if model is trained, remove unsupported parameters
        params.pop("out_dtype_", None)
        # Build the "ip:port" machine list LightGBM needs for its
        # socket-based distributed training.
        worker_ports = ctx[op.worker_ports.key]
        worker_ips = [worker.split(":", 1)[0] for worker in op.workers]
        worker_endpoints = [
            f"{worker}:{port}"
            for worker, port in zip(worker_ips, worker_ports)
        ]

        params["machines"] = ",".join(worker_endpoints)
        params["time_out"] = op.timeout
        params["num_machines"] = len(worker_endpoints)
        params["local_listen_port"] = worker_ports[op.worker_id]

        # Only these tree_learner modes are valid for distributed
        # LightGBM; fall back to data-parallel training otherwise.
        if (op.tree_learner
                or "").lower() not in {"data", "feature", "voting"}:
            logger.warning(
                "Parameter tree_learner not set or set to incorrect value "
                f'{op.tree_learner}, using "data" as default')
            params["tree_learner"] = "data"
        else:
            params["tree_learner"] = op.tree_learner

        try:
            model_cls = get_model_cls_from_type(op.model_type)
            model = model_cls(**params)
            model.fit(
                data_val,
                label_val,
                sample_weight=sample_weight_val,
                init_score=init_score_val,
                eval_set=eval_set,
                eval_sample_weight=eval_sample_weight,
                eval_init_score=eval_init_score,
                **op.kwds,
            )

            # Record the prediction dtype for downstream predict operands:
            # rankers/regressors emit floats; otherwise follow the label
            # dtype (``dtypes[0]`` when the labels are a DataFrame).
            if (op.model_type == LGBMModelType.RANKER
                    or op.model_type == LGBMModelType.REGRESSOR):
                model.set_params(out_dtype_=np.dtype("float"))
            elif hasattr(label_val, "dtype"):
                model.set_params(out_dtype_=label_val.dtype)
            else:
                model.set_params(out_dtype_=label_val.dtypes[0])

            ctx[op.outputs[0].key] = pickle.dumps(model)
        finally:
            # Always release LightGBM's network state, even on failure, so
            # the worker can join a later training session.
            _safe_call(_LIB.LGBM_NetworkFree())
Example #7
0
File: train.py  Project: yecol/mars
    def execute(cls, ctx, op: "LGBMTrain"):
        """Fit a LightGBM model for this chunk and store the pickled result.

        Delegates to the parent implementation when ``op.merge`` is set,
        which merges pre-trained sub-models instead of fitting here.
        """
        if op.merge:
            return super().execute(ctx, op)

        # Lazy import: lightgbm is only needed on workers that train.
        from lightgbm.basic import _safe_call, _LIB

        data_val = ctx[op.data.key]
        label_val = ctx[op.label.key]
        sample_weight_val = ctx[
            op.sample_weight.key] if op.sample_weight is not None else None
        init_score_val = ctx[
            op.init_score.key] if op.init_score is not None else None

        if op.eval_datas is None:
            eval_set, eval_sample_weight, eval_init_score = None, None, None
        else:
            # Resolve evaluation datasets / weights / init scores from the
            # execution context.
            eval_set, eval_sample_weight, eval_init_score = [], [], []
            for data, label in zip(op.eval_datas, op.eval_labels):
                eval_set.append((ctx[data.key], ctx[label.key]))
            for weight in op.eval_sample_weights:
                eval_sample_weight.append(
                    ctx[weight.key] if weight is not None else None)
            for score in op.eval_init_scores:
                eval_init_score.append(
                    ctx[score.key] if score is not None else None)

            # Collapse empty lists back to None so lightgbm skips eval.
            eval_set = eval_set or None
            eval_sample_weight = eval_sample_weight or None
            eval_init_score = eval_init_score or None

        params = op.params.copy()
        if ctx.running_mode == RunningMode.distributed:
            # Endpoints and port are precomputed on the op; wire them into
            # LightGBM's socket-based distributed-training parameters.
            params['machines'] = ','.join(op.lgbm_endpoints)
            params['time_out'] = op.timeout
            params['num_machines'] = len(op.lgbm_endpoints)
            params['local_listen_port'] = op.lgbm_port

            # Only these tree_learner modes are valid for distributed
            # LightGBM; fall back to data-parallel training otherwise.
            if (op.tree_learner
                    or '').lower() not in {'data', 'feature', 'voting'}:
                logger.warning(
                    'Parameter tree_learner not set or set to incorrect value %s, '
                    'using "data" as default' % op.tree_learner)
                params['tree_learner'] = 'data'
            else:
                params['tree_learner'] = op.tree_learner

        try:
            model_cls = get_model_cls_from_type(op.model_type)
            model = model_cls(**params)
            model.fit(data_val,
                      label_val,
                      sample_weight=sample_weight_val,
                      init_score=init_score_val,
                      eval_set=eval_set,
                      eval_sample_weight=eval_sample_weight,
                      eval_init_score=eval_init_score,
                      **op.kwds)

            # Record the prediction dtype for downstream predict operands:
            # array-like labels expose .dtype; DataFrame labels use
            # dtypes[0].
            if hasattr(label_val, 'dtype'):
                model.set_params(out_dtype_=label_val.dtype)
            else:
                model.set_params(out_dtype_=label_val.dtypes[0])

            ctx[op.outputs[0].key] = pickle.dumps(model)
        finally:
            # Always release LightGBM's network state, even on failure.
            _safe_call(_LIB.LGBM_NetworkFree())