Ejemplo n.º 1
0
 def __init__(self):
     """Create the job configuration and an MLConf instance for this example."""
     self.conf = Configuration()
     self.MLConf = MLConf()
Ejemplo n.º 2
0
class SGDLRLocalExample(object):
    """Logistic-regression (SGD) example that runs Angel in LOCAL deploy mode.

    Every job method first applies the shared settings from ``set_conf``,
    then adds the paths and action type for that job and passes
    ``self.conf`` to an ``LRRunner``.
    """

    def __init__(self):
        self.conf = Configuration()

    def set_conf(self):
        """Write the settings shared by all jobs of this example to ``self.conf``."""

        # Feature number of train data
        feature_num = 124
        # Total iteration number
        epoch_num = 20
        # Validation sample ratio
        v_ratio = 0.1
        # Data format, libsvm or dummy
        data_fmt = "libsvm"
        # Batch sample ratio
        sp_ratio = 1.0

        # Learning rate
        learn_rate = 1.0
        # Decay of learning rate
        decay = 0.1
        # Regularization coefficient
        reg = 0.2

        # Set local deploy mode
        self.conf.set(AngelConf.ANGEL_DEPLOY_MODE, "LOCAL")

        # Set basic configuration keys
        self.conf.set_boolean("mapred.mapper.new-api", True)
        self.conf.set(
            AngelConf.ANGEL_INPUTFORMAT_CLASS,
            'org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat')
        self.conf.set_boolean(AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST,
                              True)

        # Set data format
        self.conf.set(MLConf.ML_DATAFORMAT, data_fmt)

        # Set angel resource parameters #worker, #task, #PS
        self.conf.set_int(AngelConf.ANGEL_WORKERGROUP_NUMBER, 1)
        self.conf.set_int(AngelConf.ANGEL_WORKER_TASK_NUMBER, 1)
        self.conf.set_int(AngelConf.ANGEL_PS_NUMBER, 1)

        # Set SGD LR algorithm parameters; values are stored as strings
        self.conf.set(MLConf.ML_FEATURE_NUM, str(feature_num))
        self.conf.set(MLConf.ML_EPOCH_NUM, str(epoch_num))
        self.conf.set(MLConf.ML_BATCH_SAMPLE_Ratio, str(sp_ratio))
        self.conf.set(MLConf.ML_VALIDATE_RATIO, str(v_ratio))
        self.conf.set(MLConf.ML_LEARN_RATE, str(learn_rate))
        self.conf.set(MLConf.ML_LEARN_DECAY, str(decay))
        self.conf.set(MLConf.ML_REG_L2, str(reg))

    def train_on_local_cluster(self):
        """Train on ``a9a.train`` and save the model under the temp directory."""
        self.set_conf()
        input_path = "../data/exampledata/LRLocalExampleData/a9a.train"
        LOCAL_FS = LocalFileSystem.DEFAULT_FS
        TMP_PATH = tempfile.gettempdir()
        save_path = LOCAL_FS + TMP_PATH + "/model"
        log_path = LOCAL_FS + TMP_PATH + "/log"

        # Set training data path
        self.conf.set(AngelConf.ANGEL_TRAIN_DATA_PATH, input_path)
        # Set save model path
        self.conf.set(AngelConf.ANGEL_SAVE_MODEL_PATH, save_path)
        # Set log path
        self.conf.set(AngelConf.ANGEL_LOG_PATH, log_path)
        # Set actionType train
        self.conf.set(AngelConf.ANGEL_ACTION_TYPE, MLConf.ANGEL_ML_TRAIN)

        runner = LRRunner()
        runner.train(self.conf)

    def inc_train(self):
        """Incrementally train from ``<tmp>/model`` and save to ``<tmp>/newmodel``."""
        self.set_conf()
        input_path = "../data/exampledata/LRLocalExampleData/a9a.train"
        LOCAL_FS = LocalFileSystem.DEFAULT_FS
        TMP_PATH = tempfile.gettempdir()
        load_path = LOCAL_FS + TMP_PATH + "/model"
        save_path = LOCAL_FS + TMP_PATH + "/newmodel"
        log_path = LOCAL_FS + TMP_PATH + "/log"

        # Set training data path
        self.conf.set(AngelConf.ANGEL_TRAIN_DATA_PATH, input_path)
        # Set load model path
        self.conf.set(AngelConf.ANGEL_LOAD_MODEL_PATH, load_path)
        # Set save model path
        self.conf.set(AngelConf.ANGEL_SAVE_MODEL_PATH, save_path)
        # Set log path
        self.conf.set(AngelConf.ANGEL_LOG_PATH, log_path)
        # Set actionType incremental train
        self.conf.set(AngelConf.ANGEL_ACTION_TYPE, MLConf.ANGEL_ML_INC_TRAIN)

        runner = LRRunner()
        runner.inc_train(self.conf)

    def predict(self):
        """Predict on ``a9a.test`` with the model stored under ``<tmp>/model``."""
        self.set_conf()
        input_path = "../data/exampledata/LRLocalExampleData/a9a.test"
        LOCAL_FS = LocalFileSystem.DEFAULT_FS
        # Same temp-dir lookup as the training methods, so the model written
        # by train_on_local_cluster is found again here.
        TMP_PATH = tempfile.gettempdir()
        load_path = LOCAL_FS + TMP_PATH + "/model"
        log_path = LOCAL_FS + TMP_PATH + "/log"
        predict_path = LOCAL_FS + TMP_PATH + "/predict"

        # Set input data path (this example stores it under the train-data key)
        self.conf.set(AngelConf.ANGEL_TRAIN_DATA_PATH, input_path)
        # Set load model path
        self.conf.set(AngelConf.ANGEL_LOAD_MODEL_PATH, load_path)
        # Set predict result path
        self.conf.set(AngelConf.ANGEL_PREDICT_PATH, predict_path)
        # Set log path
        self.conf.set(AngelConf.ANGEL_LOG_PATH, log_path)
        # Set actionType prediction; the constant is passed as-is, like the
        # train / inc-train action types above.
        self.conf.set(AngelConf.ANGEL_ACTION_TYPE, MLConf.ANGEL_ML_PREDICT)

        runner = LRRunner()
        runner.predict(self.conf)
Ejemplo n.º 3
0
 def __init__(self):
     """Create the job configuration used by this example."""
     self.conf = Configuration()
Ejemplo n.º 4
0
"""
An interactive shell.

This file is designed to be launched as a PYTHONSTARTUP script.
"""
import platform
import tempfile

from hadoop.local_fs import LocalFileSystem

from pyangel.context import Configuration
from pyangel.ml.conf import MLConf
from pyangel.conf import AngelConf

# Initialise PyAngel up front. A RuntimeError is only reported on stdout and
# the rest of the startup script still runs.
try:
    Configuration._init()
except RuntimeError:
    print("Oops!PyAngel failed to initialize")


# TODO: Modify the way to get current Angel version


# Default configuration object made available at module level.
conf = Configuration()
conf[AngelConf.ANGEL_INPUTFORMAT_CLASS] = 'org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat'
conf.set_boolean("mapred.mapper.new-api", True)
conf.set_boolean(AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST, True)

# Local file-system default and the system temp directory, kept as
# module-level names (presumably for building paths from the shell).
LOCAL_FS = LocalFileSystem.DEFAULT_FS
TMP_PATH = tempfile.gettempdir()
Ejemplo n.º 5
0
class KmeansLocalExample(object):
    """K-means example that runs Angel in LOCAL deploy mode.

    ``train`` and ``predict_onLocal_cluster`` both apply ``set_conf`` first,
    run a ``KMeansRunner`` and then stop the Angel client obtained from
    ``AngelClientFactory``.
    """

    def __init__(self):
        self.conf = Configuration()
        self.MLConf = MLConf()

    def set_conf(self):
        """Write the settings shared by training and prediction to ``self.conf``."""
        data_fmt = "libsvm"

        # Cluster center number
        center_num = 3
        # Feature number of train data
        feature_num = 4
        # Total iteration number
        epoch_num = 20
        # Sample ratio per mini-batch
        sample_ratio = 1.0
        # C
        c = 0.15

        # Set local deploy mode
        self.conf.set(AngelConf.ANGEL_DEPLOY_MODE, "LOCAL")

        # Set basic configuration keys
        self.conf.set_boolean("mapred.mapper.new-api", True)
        self.conf.set(
            AngelConf.ANGEL_INPUTFORMAT_CLASS,
            'org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat')
        self.conf.set_boolean(AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST,
                              True)

        # Set angel resource parameters #worker, #task, #PS
        self.conf.set_int(AngelConf.ANGEL_WORKERGROUP_NUMBER, 1)
        self.conf.set_int(AngelConf.ANGEL_WORKER_TASK_NUMBER, 1)
        self.conf.set_int(AngelConf.ANGEL_PS_NUMBER, 1)

        # Set Kmeans algorithm parameters #cluster #feature #epoch
        self.conf.set(MLConf.KMEANS_CENTER_NUM, str(center_num))
        self.conf.set(MLConf.ML_FEATURE_NUM, str(feature_num))
        self.conf.set(MLConf.ML_EPOCH_NUM, str(epoch_num))
        self.conf.set(MLConf.KMEANS_SAMPLE_RATIO_PERBATCH, str(sample_ratio))
        # NOTE(review): lower-case "k" differs from the other KMEANS_* keys;
        # confirm the attribute name against MLConf.
        self.conf.set(MLConf.kMEANS_C, str(c))

        # Set data format
        self.conf.set(MLConf.ML_DATAFORMAT, data_fmt)

    def train(self):
        """Train on the iris sample data and save the model under the temp directory."""
        self.set_conf()
        input_path = "data/exampledata/clusteringLocalExampleData/iris"
        LOCAL_FS = LocalFileSystem.DEFAULT_FS
        TMP_PATH = tempfile.gettempdir()
        # Set training data path
        self.conf.set(AngelConf.ANGEL_TRAIN_DATA_PATH, input_path)
        # Set save model path
        self.conf.set(AngelConf.ANGEL_SAVE_MODEL_PATH,
                      LOCAL_FS + TMP_PATH + "/model")
        # Set log save path
        self.conf.set(AngelConf.ANGEL_LOG_PATH,
                      LOCAL_FS + TMP_PATH + "/kmeansLog/log")
        # Set actionType train
        self.conf.set(AngelConf.ANGEL_ACTION_TYPE, MLConf.ANGEL_ML_TRAIN)

        runner = KMeansRunner()
        runner.train(self.conf)

        angel_client = AngelClientFactory.get(self.conf)
        angel_client.stop()

    def predict_onLocal_cluster(self):
        """Predict on the iris sample data with the model stored under ``<tmp>/model``."""
        self.set_conf()
        # The data path has to be defined here; it is the same sample file
        # that ``train`` reads.
        input_path = "data/exampledata/clusteringLocalExampleData/iris"
        LOCAL_FS = LocalFileSystem.DEFAULT_FS
        TMP_PATH = tempfile.gettempdir()
        # Set predict data path
        self.conf.set(AngelConf.ANGEL_PREDICT_DATA_PATH, input_path)
        # Set load model path
        self.conf.set(AngelConf.ANGEL_LOAD_MODEL_PATH,
                      LOCAL_FS + TMP_PATH + "/model")
        # Set predict result path
        self.conf.set(AngelConf.ANGEL_PREDICT_PATH,
                      LOCAL_FS + TMP_PATH + "/predict")
        # Set actionType prediction
        self.conf.set(AngelConf.ANGEL_ACTION_TYPE, MLConf.ANGEL_ML_PREDICT)

        runner = KMeansRunner()
        runner.predict(self.conf)

        angel_client = AngelClientFactory.get(self.conf)
        angel_client.stop()
Ejemplo n.º 6
0
class FMLocalExample(object):
    """Factorization-machine example that runs Angel in LOCAL deploy mode.

    Both job methods apply ``set_conf`` first, run an ``FMRunner`` and then
    stop the Angel client obtained from ``AngelClientFactory``.
    """

    def __init__(self):
        self.conf = Configuration()

    def set_conf(self):
        """
        set parameter values of self.conf
        """

        # Feature number of train data
        feature_num = 236
        # Total iteration number
        epoch_num = 20
        # Rank
        rank = 5
        # Regularization parameters
        reg0 = 0.0
        reg1 = 0.0
        reg2 = 0.001
        # Learning rate
        lr = 0.001
        # Value stored under ML_FM_V_STDDEV
        stev = 0.1

        # Set local deploy mode
        self.conf.set(AngelConf.ANGEL_DEPLOY_MODE, "LOCAL")

        # Set basic configuration keys
        self.conf.set_boolean("mapred.mapper.new-api", True)
        self.conf.set(
            AngelConf.ANGEL_INPUTFORMAT_CLASS,
            'org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat')
        self.conf.set_boolean(AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST,
                              True)

        # Set angel resource parameters #worker, #task, #PS
        self.conf.set_int(AngelConf.ANGEL_WORKERGROUP_NUMBER, 1)
        self.conf.set_int(AngelConf.ANGEL_WORKER_TASK_NUMBER, 1)
        self.conf.set_int(AngelConf.ANGEL_PS_NUMBER, 1)

        # Set FM algorithm parameters #feature #epoch
        self.conf.set(MLConf.ML_FEATURE_NUM, str(feature_num))
        self.conf.set(MLConf.ML_EPOCH_NUM, str(epoch_num))
        self.conf.set(MLConf.ML_FM_RANK, str(rank))
        self.conf.set(MLConf.ML_LEARN_RATE, str(lr))
        self.conf.set(MLConf.ML_FM_REG0, str(reg0))
        self.conf.set(MLConf.ML_FM_REG1, str(reg1))
        self.conf.set(MLConf.ML_FM_REG2, str(reg2))
        self.conf.set(MLConf.ML_FM_V_STDDEV, str(stev))

    def train_on_local_cluster(self):
        """Train on ``food_fm_libsvm`` and save the model under the temp directory."""
        self.set_conf()
        input_path = "./src/test/data/fm/food_fm_libsvm"
        LOCAL_FS = LocalFileSystem.DEFAULT_FS
        TMP_PATH = tempfile.gettempdir()
        save_path = LOCAL_FS + TMP_PATH + "/model"
        log_path = LOCAL_FS + TMP_PATH + "/LRlog"

        # Set training data path
        self.conf.set(AngelConf.ANGEL_TRAIN_DATA_PATH, input_path)
        # Set save model path
        self.conf.set(AngelConf.ANGEL_SAVE_MODEL_PATH, save_path)
        # Set log path
        self.conf.set(AngelConf.ANGEL_LOG_PATH, log_path)
        # Set actionType train; the constant is passed as-is, exactly as
        # fm_classification does.
        self.conf.set(AngelConf.ANGEL_ACTION_TYPE, MLConf.ANGEL_ML_TRAIN)

        runner = FMRunner()
        runner.train(self.conf)

        angel_client = AngelClientFactory.get(self.conf)
        angel_client.stop()

    def fm_classification(self):
        """Train a classification FM model on ``a9a.train``.

        Overrides the learn type ("c") and the feature number (124) on top of
        the shared settings.
        """
        # Apply the shared settings first so a fresh instance also gets the
        # deploy mode and resource parameters; the overrides below are written
        # afterwards and therefore win.
        self.set_conf()
        input_path = "./src/test/data/fm/a9a.train"
        LOCAL_FS = LocalFileSystem.DEFAULT_FS
        TMP_PATH = tempfile.gettempdir()
        save_path = LOCAL_FS + TMP_PATH + "/model"
        log_path = LOCAL_FS + TMP_PATH + "/LRlog"

        # Set training data path
        self.conf.set(AngelConf.ANGEL_TRAIN_DATA_PATH, input_path)
        # Set save model path
        self.conf.set(AngelConf.ANGEL_SAVE_MODEL_PATH, save_path)
        # Set log path
        self.conf.set(AngelConf.ANGEL_LOG_PATH, log_path)
        # Set actionType train
        self.conf.set(AngelConf.ANGEL_ACTION_TYPE, MLConf.ANGEL_ML_TRAIN)
        # Set learnType
        self.conf.set(MLConf.ML_FM_LEARN_TYPE, "c")
        # Set feature number
        self.conf.set(MLConf.ML_FEATURE_NUM, str(124))

        runner = FMRunner()
        runner.train(self.conf)

        angel_client = AngelClientFactory.get(self.conf)
        angel_client.stop()
Ejemplo n.º 7
0
class LinearRegLocalExample(object):
    """
    Linear Regression Example used for user test, similar to "com.tencent.angel.example.ml.LinearRegLocalExample".
    """

    def __init__(self):
        self.conf = Configuration()

    def set_conf(self):
        """
        Set up self.conf for the runtime environment.
        """
        # Feature number of train data
        feature_num = 101
        # Total iteration number
        epoch_num = 20
        # Validation sample ratio
        v_ratio = 0.5
        # Data format, libsvm or dummy
        data_fmt = 'libsvm'
        # Batch sample ratio
        sp_ratio = 1

        # Learning rate
        learn_rate = 0.1
        # Decay of learning rate
        decay = 0.01
        # Regularization coefficient
        reg = 0

        # Set job queue, if you use YARN deploy mode, you can set job queue by
        # self.conf.set('mapreduce.job.queue.name', 'default')

        # Set local deploy mode
        self.conf.set(AngelConf.ANGEL_DEPLOY_MODE, 'LOCAL')

        # Set basic configuration keys
        self.conf.set_boolean('mapred.mapper.new-api', True)
        self.conf.set(
            AngelConf.ANGEL_INPUTFORMAT_CLASS,
            'org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat')
        self.conf.set_boolean(AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST,
                              True)

        # Set data format
        self.conf.set(MLConf.ML_DATAFORMAT, data_fmt)

        # Set angel resource parameters #worker, #task, #PS
        self.conf.set_int(AngelConf.ANGEL_WORKERGROUP_NUMBER, 2)
        self.conf.set_int(AngelConf.ANGEL_WORKER_TASK_NUMBER, 10)
        self.conf.set_int(AngelConf.ANGEL_PS_NUMBER, 2)

        # Set SGD algorithm parameters; values are stored as strings
        self.conf.set(MLConf.ML_FEATURE_NUM, str(feature_num))
        self.conf.set(MLConf.ML_EPOCH_NUM, str(epoch_num))
        self.conf.set(MLConf.ML_BATCH_SAMPLE_Ratio, str(sp_ratio))
        self.conf.set(MLConf.ML_VALIDATE_RATIO, str(v_ratio))
        self.conf.set(MLConf.ML_LEARN_RATE, str(learn_rate))
        self.conf.set(MLConf.ML_LEARN_DECAY, str(decay))
        self.conf.set(MLConf.ML_REG_L2, str(reg))

    def train_on_local_cluster(self):
        """
        Train model on local cluster
        """
        self.set_conf()
        input_path = '../data/exampledata/LinearRegression'
        LOCAL_FS = LocalFileSystem.DEFAULT_FS
        TMP_PATH = tempfile.gettempdir()
        # Same relative log directory that inc_train uses.
        log_path = "./src/test/log"
        model_path = 'file:///tmp/angel/model'

        self.conf.set(AngelConf.ANGEL_TRAIN_DATA_PATH, input_path)
        self.conf.set(AngelConf.ANGEL_SAVE_MODEL_PATH, model_path)
        self.conf.set(AngelConf.ANGEL_LOG_PATH, log_path)
        self.conf.set(AngelConf.ANGEL_ACTION_TYPE, MLConf.ANGEL_ML_TRAIN)
        self.conf.set("fs.defaultFS", LOCAL_FS + TMP_PATH)

        runner = LinearRegRunner()
        runner.train(self.conf)

    def inc_train(self):
        """Incrementally train from ``<tmp>/model`` and save to ``<tmp>/newmodel``.

        NOTE(review): ``train_on_local_cluster`` saves to
        'file:///tmp/angel/model'; confirm the load path below points at the
        model you intend to continue from.
        """
        self.set_conf()
        input_path = "../data/exampledata/LinearRegression/LinearReg100.train"
        LOCAL_FS = LocalFileSystem.DEFAULT_FS
        TMP_PATH = tempfile.gettempdir()
        log_path = "./src/test/log"

        # Set training data path
        self.conf.set(AngelConf.ANGEL_TRAIN_DATA_PATH, input_path)
        # Set load model path
        self.conf.set(AngelConf.ANGEL_LOAD_MODEL_PATH,
                      LOCAL_FS + TMP_PATH + "/model")
        # Set save model path
        self.conf.set(AngelConf.ANGEL_SAVE_MODEL_PATH,
                      LOCAL_FS + TMP_PATH + "/newmodel")
        # Set actionType incremental train; the constant is passed as-is,
        # like ANGEL_ML_TRAIN above.
        self.conf.set(AngelConf.ANGEL_ACTION_TYPE, MLConf.ANGEL_ML_INC_TRAIN)
        # Set log path
        self.conf.set(AngelConf.ANGEL_LOG_PATH, log_path)

        runner = LinearRegRunner()
        # NOTE(review): assumes the runner exposes the snake_case method.
        runner.inc_train(self.conf)

    def predict(self):
        """Predict with the model stored under ``<tmp>/model``."""
        self.set_conf()
        input_path = "../data/exampledata/LinearRegression/LinearReg100.train"
        LOCAL_FS = LocalFileSystem.DEFAULT_FS
        TMP_PATH = tempfile.gettempdir()

        # Set input data path (this example stores it under the train-data key)
        self.conf.set(AngelConf.ANGEL_TRAIN_DATA_PATH, input_path)
        # Set load model path
        self.conf.set(AngelConf.ANGEL_LOAD_MODEL_PATH,
                      LOCAL_FS + TMP_PATH + "/model")
        # Set predict result path
        self.conf.set(AngelConf.ANGEL_PREDICT_PATH,
                      LOCAL_FS + TMP_PATH + "/predict")
        # Set actionType prediction
        self.conf.set(AngelConf.ANGEL_ACTION_TYPE, MLConf.ANGEL_ML_PREDICT)

        runner = LinearRegRunner()
        runner.predict(self.conf)
Ejemplo n.º 8
0
class MFLocalExample(object):
    """Matrix-factorization example that runs Angel in LOCAL deploy mode."""

    def __init__(self):
        self.conf = Configuration()
        self.MLConf = MLConf()

    def set_conf(self):
        """Fill ``self.conf`` with deploy, resource, path and MF settings."""
        settings = self.conf
        train_data = "../../data/exampledata/MFLocalExampleData"

        # Local deploy mode
        settings.set(AngelConf.ANGEL_DEPLOY_MODE, "LOCAL")

        # Basic configuration keys
        settings.set_boolean("mapred.mapper.new-api", True)
        input_format = 'org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat'
        settings.set(AngelConf.ANGEL_INPUTFORMAT_CLASS, input_format)
        settings.set_boolean(AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST, True)

        # Angel resources: worker groups, tasks per worker, parameter servers
        settings.set_int(AngelConf.ANGEL_WORKERGROUP_NUMBER, 1)
        settings.set_int(AngelConf.ANGEL_WORKER_TASK_NUMBER, 1)
        settings.set_int(AngelConf.ANGEL_PS_NUMBER, 1)

        # Model and log output both live under the system temp directory
        local_root = LocalFileSystem.DEFAULT_FS + tempfile.gettempdir()

        # Training data, model output and log locations
        settings.set(AngelConf.ANGEL_TRAIN_DATA_PATH, train_data)
        settings.set(AngelConf.ANGEL_SAVE_MODEL_PATH, local_root + "/model")
        settings.set(AngelConf.ANGEL_LOG_PATH, local_root + "/log")

        # Action type: train
        settings.set(AngelConf.ANGEL_ACTION_TYPE, MLConf.ANGEL_ML_TRAIN)

        # MF algorithm parameters
        settings.set(MLConf.ML_MF_RANK, "200")
        settings.set(MLConf.ML_EPOCH_NUM, "8")
        settings.set(MLConf.ML_MF_ROW_BATCH_NUM, "2")
        settings.set(MLConf.ML_MF_ITEM_NUM, "1683")
        settings.set(MLConf.ML_MF_LAMBDA, "0.01")
        settings.set(MLConf.ML_MF_ETA, "0.0054")

    def train(self):
        """Apply ``set_conf`` and run a ``MatrixFactorizationRunner`` training job."""
        self.set_conf()
        MatrixFactorizationRunner().train(self.conf)
Ejemplo n.º 9
0
class GBDTExample(object):
    """GBDT example that runs Angel in LOCAL deploy mode via a ``GBDTRunner``."""

    def __init__(self):
        self.conf = Configuration()

    @staticmethod
    def _binary_cate_feat(feature_num):
        """Return ``"0:2,1:2,...,<feature_num - 1>:2"``.

        One ``index:2`` entry per feature index, comma separated; an empty
        string when ``feature_num`` is 0.
        """
        return ",".join("%d:2" % idx for idx in range(feature_num))

    def set_conf(self):
        """Write the LOCAL-mode resource settings and GBDT parameters to ``self.conf``.

        Data, model and log paths are not set here; ``train`` adds them.
        :return:
        """
        feature_num = 127

        # NOTE(review): other examples in this project use
        # MLConf.ML_DATAFORMAT; confirm ML_DATA_FORMAT exists on MLConf.
        params = {
            AngelConf.ANGEL_DEPLOY_MODE: 'LOCAL',
            'mapred.mapper.new-api': True,
            AngelConf.ANGEL_INPUTFORMAT_CLASS: 'org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat',
            AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST: True,
            AngelConf.ANGEL_WORKERGROUP_NUMBER: 1,
            AngelConf.ANGEL_WORKER_TASK_NUMBER: 1,
            AngelConf.ANGEL_PS_NUMBER: 1,
            MLConf.ML_DATA_FORMAT: 'libsvm',
            MLConf.ML_FEATURE_NUM: feature_num,
            MLConf.ML_FEATURE_NNZ: 25,
            MLConf.ML_GBDT_TREE_NUM: 2,
            MLConf.ML_GBDT_TREE_DEPTH: 2,
            MLConf.ML_GBDT_SPLIT_NUM: 10,
            MLConf.ML_GBDT_SAMPLE_RATIO: 1.0,
            MLConf.ML_LEARN_RATE: 0.01,
            # One "index:2" entry for every feature index 0..feature_num-1.
            MLConf.ML_GBDT_CATE_FEAT: self._binary_cate_feat(feature_num),
        }

        self.conf.update(params)

    def train(self):
        """Train on ``agaricus.txt.train`` and save the model under the temp directory.

        ``input_path`` is relative to the working directory; replace it with
        an absolute ``file://`` URI of your Angel installation (for example
        "file:///home/angel/angel_1.3.0/data/exampledata/GBDTLocalExampleData/agaricus.txt.train")
        when launching from elsewhere.
        """
        self.set_conf()

        local_fs = LocalFileSystem.DEFAULT_FS
        tmp_path = tempfile.gettempdir()
        save_path = local_fs + tmp_path + "/model"
        log_path = local_fs + tmp_path + "/GBDTlog"
        input_path = "data/exampledata/GBDTLocalExampleData/agaricus.txt.train"

        self.conf[AngelConf.ANGEL_TRAIN_DATA_PATH] = input_path
        # Set save model path (written once; this is the value that took
        # effect before as well)
        self.conf[AngelConf.ANGEL_SAVE_MODEL_PATH] = save_path
        # Set log path
        self.conf[AngelConf.ANGEL_LOG_PATH] = log_path
        # Set actionType train
        self.conf[AngelConf.ANGEL_ACTION_TYPE] = MLConf.ANGEL_ML_TRAIN

        runner = GBDTRunner()
        runner.train(self.conf)

    def predict(self):
        """Run ``GBDTRunner.predict`` with split-feature/value paths under the temp directory."""
        self.set_conf()
        # The "/out/xxx" names look like placeholders; point them at the real
        # split-feature / split-value outputs before use.
        tmp_path = tempfile.gettempdir()
        self.conf["gbdt.split.feature"] = tmp_path + "/out/xxx"
        self.conf["gbdt.split.value"] = tmp_path + "/out/xxx"

        runner = GBDTRunner()

        runner.predict(self.conf)
Ejemplo n.º 10
0
class FMLocalExample(object):
    """Factorization-machine example that runs Angel in LOCAL deploy mode.

    Both job methods apply ``set_conf`` first, run an ``FMRunner`` and then
    stop the Angel client obtained from ``AngelClientFactory``.
    """

    def __init__(self):
        self.conf = Configuration()

    def set_conf(self):
        """
        set parameter values of self.conf
        """

        # Feature number of train data
        feature_num = 236
        # Total iteration number
        epoch_num = 20
        # Rank
        rank = 5
        # Regularization parameters
        reg0 = 0.0
        reg1 = 0.0
        reg2 = 0.001
        # Learning rate
        lr = 0.001
        # Value stored under ML_FM_V_STDDEV
        stev = 0.1

        # Set local deploy mode
        self.conf.set(AngelConf.ANGEL_DEPLOY_MODE, "LOCAL")

        # Set basic configuration keys
        self.conf.set_boolean("mapred.mapper.new-api", True)
        self.conf.set(
            AngelConf.ANGEL_INPUTFORMAT_CLASS,
            'org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat')
        self.conf.set_boolean(AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST,
                              True)

        # Set angel resource parameters #worker, #task, #PS
        self.conf.set_int(AngelConf.ANGEL_WORKERGROUP_NUMBER, 1)
        self.conf.set_int(AngelConf.ANGEL_WORKER_TASK_NUMBER, 1)
        self.conf.set_int(AngelConf.ANGEL_PS_NUMBER, 1)

        # Set FM algorithm parameters #feature #epoch
        self.conf.set(MLConf.ML_FEATURE_NUM, str(feature_num))
        self.conf.set(MLConf.ML_EPOCH_NUM, str(epoch_num))
        self.conf.set(MLConf.ML_FM_RANK, str(rank))
        self.conf.set(MLConf.ML_LEARN_RATE, str(lr))
        self.conf.set(MLConf.ML_FM_REG0, str(reg0))
        self.conf.set(MLConf.ML_FM_REG1, str(reg1))
        self.conf.set(MLConf.ML_FM_REG2, str(reg2))
        self.conf.set(MLConf.ML_FM_V_STDDEV, str(stev))

    def train_on_local_cluster(self):
        """Train on ``food_fm_libsvm`` and save the model under the temp directory."""
        self.set_conf()
        input_path = "./src/test/data/fm/food_fm_libsvm"
        LOCAL_FS = LocalFileSystem.DEFAULT_FS
        TMP_PATH = tempfile.gettempdir()
        save_path = LOCAL_FS + TMP_PATH + "/model"
        log_path = LOCAL_FS + TMP_PATH + "/LRlog"

        # Set training data path
        self.conf.set(AngelConf.ANGEL_TRAIN_DATA_PATH, input_path)
        # Set save model path
        self.conf.set(AngelConf.ANGEL_SAVE_MODEL_PATH, save_path)
        # Set log path
        self.conf.set(AngelConf.ANGEL_LOG_PATH, log_path)
        # Set actionType train; the constant is passed as-is, exactly as
        # fm_classification does.
        self.conf.set(AngelConf.ANGEL_ACTION_TYPE, MLConf.ANGEL_ML_TRAIN)

        runner = FMRunner()
        runner.train(self.conf)

        angel_client = AngelClientFactory.get(self.conf)
        angel_client.stop()

    def fm_classification(self):
        """Train a classification FM model on ``a9a.train``.

        Overrides the learn type ("c") and the feature number (124) on top of
        the shared settings.
        """
        # Apply the shared settings first so a fresh instance also gets the
        # deploy mode and resource parameters; the overrides below are written
        # afterwards and therefore win.
        self.set_conf()
        input_path = "./src/test/data/fm/a9a.train"
        LOCAL_FS = LocalFileSystem.DEFAULT_FS
        TMP_PATH = tempfile.gettempdir()
        save_path = LOCAL_FS + TMP_PATH + "/model"
        log_path = LOCAL_FS + TMP_PATH + "/LRlog"

        # Set training data path
        self.conf.set(AngelConf.ANGEL_TRAIN_DATA_PATH, input_path)
        # Set save model path
        self.conf.set(AngelConf.ANGEL_SAVE_MODEL_PATH, save_path)
        # Set log path
        self.conf.set(AngelConf.ANGEL_LOG_PATH, log_path)
        # Set actionType train
        self.conf.set(AngelConf.ANGEL_ACTION_TYPE, MLConf.ANGEL_ML_TRAIN)
        # Set learnType
        self.conf.set(MLConf.ML_FM_LEARN_TYPE, "c")
        # Set feature number
        self.conf.set(MLConf.ML_FEATURE_NUM, str(124))

        runner = FMRunner()
        runner.train(self.conf)

        angel_client = AngelClientFactory.get(self.conf)
        angel_client.stop()
Ejemplo n.º 11
0
class LinearRegLocalExample(object):
    """
    Linear Regression Example used for user test, similar to "com.tencent.angel.example.ml.LinearRegLocalExample".
    """

    def __init__(self):
        self.conf = Configuration()

    def set_conf(self):
        """
        Set up self.conf for the runtime environment.
        """
        # Feature number of train data
        feature_num = 101
        # Total iteration number
        epoch_num = 20
        # Validation sample ratio
        v_ratio = 0.5
        # Data format, libsvm or dummy
        data_fmt = 'libsvm'
        # Batch sample ratio
        sp_ratio = 1

        # Learning rate
        learn_rate = 0.1
        # Decay of learning rate
        decay = 0.01
        # Regularization coefficient
        reg = 0

        # Set job queue, if you use YARN deploy mode, you can set job queue by
        # self.conf.set('mapreduce.job.queue.name', 'default')

        # Set local deploy mode
        self.conf.set(AngelConf.ANGEL_DEPLOY_MODE, 'LOCAL')

        # Set basic configuration keys
        self.conf.set_boolean('mapred.mapper.new-api', True)
        self.conf.set(
            AngelConf.ANGEL_INPUTFORMAT_CLASS,
            'org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat')
        self.conf.set_boolean(AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST,
                              True)

        # Set data format
        self.conf.set(MLConf.ML_DATAFORMAT, data_fmt)

        # Set angel resource parameters #worker, #task, #PS
        self.conf.set_int(AngelConf.ANGEL_WORKERGROUP_NUMBER, 2)
        self.conf.set_int(AngelConf.ANGEL_WORKER_TASK_NUMBER, 10)
        self.conf.set_int(AngelConf.ANGEL_PS_NUMBER, 2)

        # Set SGD algorithm parameters; values are stored as strings
        self.conf.set(MLConf.ML_FEATURE_NUM, str(feature_num))
        self.conf.set(MLConf.ML_EPOCH_NUM, str(epoch_num))
        self.conf.set(MLConf.ML_BATCH_SAMPLE_Ratio, str(sp_ratio))
        self.conf.set(MLConf.ML_VALIDATE_RATIO, str(v_ratio))
        self.conf.set(MLConf.ML_LEARN_RATE, str(learn_rate))
        self.conf.set(MLConf.ML_LEARN_DECAY, str(decay))
        self.conf.set(MLConf.ML_REG_L2, str(reg))

    def train_on_local_cluster(self):
        """
        Train model on local cluster
        """
        self.set_conf()
        input_path = '../data/exampledata/LinearRegression'
        LOCAL_FS = LocalFileSystem.DEFAULT_FS
        TMP_PATH = tempfile.gettempdir()
        # Same relative log directory that inc_train uses.
        log_path = "./src/test/log"
        model_path = 'file:///tmp/angel/model'

        self.conf.set(AngelConf.ANGEL_TRAIN_DATA_PATH, input_path)
        self.conf.set(AngelConf.ANGEL_SAVE_MODEL_PATH, model_path)
        self.conf.set(AngelConf.ANGEL_LOG_PATH, log_path)
        self.conf.set(AngelConf.ANGEL_ACTION_TYPE, MLConf.ANGEL_ML_TRAIN)
        self.conf.set("fs.defaultFS", LOCAL_FS + TMP_PATH)

        runner = LinearRegRunner()
        runner.train(self.conf)

    def inc_train(self):
        """Incrementally train from ``<tmp>/model`` and save to ``<tmp>/newmodel``.

        NOTE(review): ``train_on_local_cluster`` saves to
        'file:///tmp/angel/model'; confirm the load path below points at the
        model you intend to continue from.
        """
        self.set_conf()
        input_path = "../data/exampledata/LinearRegression/LinearReg100.train"
        LOCAL_FS = LocalFileSystem.DEFAULT_FS
        TMP_PATH = tempfile.gettempdir()
        log_path = "./src/test/log"

        # Set training data path
        self.conf.set(AngelConf.ANGEL_TRAIN_DATA_PATH, input_path)
        # Set load model path
        self.conf.set(AngelConf.ANGEL_LOAD_MODEL_PATH,
                      LOCAL_FS + TMP_PATH + "/model")
        # Set save model path
        self.conf.set(AngelConf.ANGEL_SAVE_MODEL_PATH,
                      LOCAL_FS + TMP_PATH + "/newmodel")
        # Set actionType incremental train; the constant is passed as-is,
        # like ANGEL_ML_TRAIN above.
        self.conf.set(AngelConf.ANGEL_ACTION_TYPE, MLConf.ANGEL_ML_INC_TRAIN)
        # Set log path
        self.conf.set(AngelConf.ANGEL_LOG_PATH, log_path)

        runner = LinearRegRunner()
        # NOTE(review): assumes the runner exposes the snake_case method.
        runner.inc_train(self.conf)

    def predict(self):
        """Predict with the model stored under ``<tmp>/model``."""
        self.set_conf()
        input_path = "../data/exampledata/LinearRegression/LinearReg100.train"
        LOCAL_FS = LocalFileSystem.DEFAULT_FS
        TMP_PATH = tempfile.gettempdir()

        # Set input data path (this example stores it under the train-data key)
        self.conf.set(AngelConf.ANGEL_TRAIN_DATA_PATH, input_path)
        # Set load model path
        self.conf.set(AngelConf.ANGEL_LOAD_MODEL_PATH,
                      LOCAL_FS + TMP_PATH + "/model")
        # Set predict result path
        self.conf.set(AngelConf.ANGEL_PREDICT_PATH,
                      LOCAL_FS + TMP_PATH + "/predict")
        # Set actionType prediction
        self.conf.set(AngelConf.ANGEL_ACTION_TYPE, MLConf.ANGEL_ML_PREDICT)

        runner = LinearRegRunner()
        runner.predict(self.conf)
Ejemplo n.º 12
0
 def __init__(self):
     """Create the job configuration and an MLConf instance for this example."""
     self.conf= Configuration()
     self.MLConf = MLConf()
Ejemplo n.º 13
0
class MFLocalExample(object):
    """Matrix-factorization example that runs Angel in LOCAL deploy mode."""

    def __init__(self):
        self.conf = Configuration()
        self.MLConf = MLConf()

    def set_conf(self):
        """Fill ``self.conf`` with deploy, resource, path and MF settings."""
        settings = self.conf
        train_data = "../../data/exampledata/MFLocalExampleData"

        # Local deploy mode
        settings.set(AngelConf.ANGEL_DEPLOY_MODE, "LOCAL")

        # Basic configuration keys
        settings.set_boolean("mapred.mapper.new-api", True)
        input_format = 'org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat'
        settings.set(AngelConf.ANGEL_INPUTFORMAT_CLASS, input_format)
        settings.set_boolean(AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST, True)

        # Angel resources: worker groups, tasks per worker, parameter servers
        settings.set_int(AngelConf.ANGEL_WORKERGROUP_NUMBER, 1)
        settings.set_int(AngelConf.ANGEL_WORKER_TASK_NUMBER, 1)
        settings.set_int(AngelConf.ANGEL_PS_NUMBER, 1)

        # Model and log output both live under the system temp directory
        local_root = LocalFileSystem.DEFAULT_FS + tempfile.gettempdir()

        # Training data, model output and log locations
        settings.set(AngelConf.ANGEL_TRAIN_DATA_PATH, train_data)
        settings.set(AngelConf.ANGEL_SAVE_MODEL_PATH, local_root + "/model")
        settings.set(AngelConf.ANGEL_LOG_PATH, local_root + "/log")

        # Action type: train
        settings.set(AngelConf.ANGEL_ACTION_TYPE, MLConf.ANGEL_ML_TRAIN)

        # MF algorithm parameters
        settings.set(MLConf.ML_MF_RANK, "200")
        settings.set(MLConf.ML_EPOCH_NUM, "8")
        settings.set(MLConf.ML_MF_ROW_BATCH_NUM, "2")
        settings.set(MLConf.ML_MF_ITEM_NUM, "1683")
        settings.set(MLConf.ML_MF_LAMBDA, "0.01")
        settings.set(MLConf.ML_MF_ETA, "0.0054")

    def train(self):
        """Apply ``set_conf`` and run a ``MatrixFactorizationRunner`` training job."""
        self.set_conf()
        MatrixFactorizationRunner().train(self.conf)
Ejemplo n.º 14
0
class GBDTExample(object):
    """GBDT train/predict example configured for LOCAL deploy mode."""

    def __init__(self):
        """Create the configuration object that the other methods fill in."""
        self.conf = Configuration()

    def set_conf(self):
        """Populate ``self.conf`` with deploy, resource and GBDT settings.

        Every setting is written twice, as in the original example: once by
        item assignment (numeric algorithm parameters converted with
        ``str``) and once, with its raw Python value, through
        ``self.conf.load``. Both passes read the same local variables so the
        two copies cannot drift apart. Data paths are not set here; ``train``
        and ``predict`` add their own.

        :return: None
        """
        # Feature number of train data
        feature_num = 127
        # Number of nonzero features
        feature_nnz = 25
        # Tree number
        tree_num = 2
        # Tree depth
        tree_depth = 2
        # Split number
        split_num = 10
        # Feature sample ratio
        sample_ratio = 1.0
        # Data format
        data_fmt = "libsvm"
        # Learning rate
        learn_rate = 0.01
        # Hadoop input format class used to read the data
        input_format = 'org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat'

        # Use local deploy mode
        self.conf[AngelConf.ANGEL_DEPLOY_MODE] = "LOCAL"

        # Basic job configuration keys
        self.conf['mapred.mapper.new-api'] = True
        self.conf[AngelConf.ANGEL_INPUTFORMAT_CLASS] = input_format
        self.conf[AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST] = True

        # Set angel resource parameters #worker, #task, #PS
        self.conf[AngelConf.ANGEL_WORKERGROUP_NUMBER] = 1
        self.conf[AngelConf.ANGEL_WORKER_TASK_NUMBER] = 1
        self.conf[AngelConf.ANGEL_PS_NUMBER] = 1

        # Set GBDT algorithm parameters (stringified)
        self.conf[MLConf.ML_DATA_FORMAT] = data_fmt
        self.conf[MLConf.ML_FEATURE_NUM] = str(feature_num)
        self.conf[MLConf.ML_FEATURE_NNZ] = str(feature_nnz)
        self.conf[MLConf.ML_GBDT_TREE_NUM] = str(tree_num)
        self.conf[MLConf.ML_GBDT_TREE_DEPTH] = str(tree_depth)
        self.conf[MLConf.ML_GBDT_SPLIT_NUM] = str(split_num)
        self.conf[MLConf.ML_GBDT_SAMPLE_RATIO] = str(sample_ratio)
        self.conf[MLConf.ML_LEARN_RATE] = str(learn_rate)

        # The same settings as one mapping, with raw (non-string) values,
        # handed to ``load`` in a single call.
        params = {
            AngelConf.ANGEL_DEPLOY_MODE: 'LOCAL',
            'mapred.mapper.new-api': True,
            AngelConf.ANGEL_INPUTFORMAT_CLASS: input_format,
            AngelConf.ANGEL_JOB_OUTPUT_PATH_DELETEONEXIST: True,
            AngelConf.ANGEL_WORKERGROUP_NUMBER: 1,
            AngelConf.ANGEL_WORKER_TASK_NUMBER: 1,
            AngelConf.ANGEL_PS_NUMBER: 1,
            MLConf.ML_DATA_FORMAT: data_fmt,
            MLConf.ML_FEATURE_NUM: feature_num,
            MLConf.ML_FEATURE_NNZ: feature_nnz,
            MLConf.ML_GBDT_TREE_NUM: tree_num,
            MLConf.ML_GBDT_TREE_DEPTH: tree_depth,
            MLConf.ML_GBDT_SPLIT_NUM: split_num,
            MLConf.ML_GBDT_SAMPLE_RATIO: sample_ratio,
            MLConf.ML_LEARN_RATE: learn_rate
        }

        self.conf.load(params)

    def train(self):
        """Configure the job and train a GBDT model with ``GBDTRunner``.

        The training data path is a relative path. If it does not resolve in
        your environment, replace it with an absolute URI into your local
        Angel installation, e.g.
        "file:///home/angel/angel_1.3.0/data/exampledata/GBDTLocalExampleData/agaricus.txt.train".
        The model and log paths are built from ``LocalFileSystem.DEFAULT_FS``
        followed by ``tempfile.gettempdir()``.
        """
        self.set_conf()

        local_fs = LocalFileSystem.DEFAULT_FS
        tmp_path = tempfile.gettempdir()
        save_path = local_fs + tmp_path + "/model"
        log_path = local_fs + tmp_path + "/GBDTlog"
        input_path = "data/exampledata/GBDTLocalExampleData/agaricus.txt.train"

        # Set training data path
        self.conf[AngelConf.ANGEL_TRAIN_DATA_PATH] = input_path
        # Set save-model path; assigned exactly once so the effective value
        # is unambiguous.
        self.conf[AngelConf.ANGEL_SAVE_MODEL_PATH] = save_path
        # Set log path
        self.conf[AngelConf.ANGEL_LOG_PATH] = log_path
        # Set actionType train
        self.conf[AngelConf.ANGEL_ACTION_TYPE] = MLConf.ANGEL_ML_TRAIN

        runner = GBDTRunner()
        runner.train(self.conf)

    def predict(self):
        """Configure the job and run prediction with ``GBDTRunner``."""
        self.set_conf()

        # NOTE(review): both entries point at the same "/out/xxx" placeholder
        # under the temp directory - presumably to be replaced with the real
        # split-feature / split-value model locations; confirm before use.
        tmp_path = tempfile.gettempdir()
        self.conf["gbdt.split.feature"] = tmp_path + "/out/xxx"
        self.conf["gbdt.split.value"] = tmp_path + "/out/xxx"

        runner = GBDTRunner()
        # Pass the instance configuration; a bare ``conf`` is not defined in
        # this scope.
        runner.predict(self.conf)