Example #1
    def __init__(self, window = 50, datasource = 'local', preprocesses = ['None'], datadir = None, reward = 'ROI',
                 use_market_profile = False):
        self.window = window
        self.reward_meth = reward
        #self.action_space = spaces.Discrete(3)
        self.action_space = dict(type = 'int', shape = 1, num_actions = 3, min_value = 0, max_value = 2)
        self.use_market_profile = use_market_profile
        self.preprocesses = preprocesses
        self.fig = None

        # FIXME: is this commission rate correct?
        self.commission = 0.1 / 100  # 0.1% commission per trade

        if datasource == 'local':
            if datadir is None:  # check for NoneType
                raise ValueError('Error: please specify data directory.')
            else:
                self.data = self.load_normal(datadir = datadir)
        elif datasource == 'robinhood':
            raise NotImplementedError('robinhood datasource is not implemented yet')
        elif datasource == 'iex':
            raise NotImplementedError('iex datasource is not implemented yet')

        for preprocess in preprocesses:
            if preprocess == 'None':
                pass
            elif preprocess == 'MinMax':            # normalized 0 to 1, wouldn't recommend
                self.data = self.preprocess_MinMax()
            elif preprocess == 'renko':             # blocks
                self.data = self.preprocess_renko()
            elif preprocess == 'log_transform':     # log return values
                self.data = self.preprocess_log_transform()
            elif preprocess == 'autoencode':
                self.data = self.preprocess_autoencode()
        
        if getattr(self, 'observation_space', None) is None: # not yet set by preprocessing
            self.observation_space = dict(type = 'float', shape = [self.window, self.data.shape[1]])
            #self.observation_space = spaces.Box(low = 0, high = 10000, shape = (self.window, self.data.shape[1]))
        
        # data should be loaded and processed
        print_data_info(self.data)
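
# For context, a minimal sketch of what a `print_data_info` helper for the example
# above could look like. This is an assumption for illustration, not the
# repository's actual implementation; it only relies on the data exposing a
# numpy/pandas-style `.shape`.
def print_data_info(data):
    """Print basic information about the loaded market data."""
    print('data shape: {}'.format(data.shape))
    # pandas DataFrames expose `.columns`; plain arrays do not
    columns = getattr(data, 'columns', None)
    if columns is not None:
        print('columns: {}'.format(list(columns)))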

Example #2

allow_soft_placement = True   # assumed value; referenced below but not defined in this excerpt
log_device_placement = False  # log placement of operations on devices

# Data Preparation
# ==================================================

train, test = data.load_dataset(args.dataset,
                                out=args.out,
                                vocab_size=args.vocab_size)

x_train = train.data.astype(np.float32)
x_test = test.data.astype(np.float32)
y_train = train.labels
y_test = test.labels

# Print information about the dataset
utils.print_data_info(train, x_train, x_test, y_train, y_test)

# To print for results.csv
data_str = "{{format: '{}', vocab_size: {}}}".format(args.out,
                                                     len(train.vocab))

# Training
# ==================================================

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(allow_soft_placement=allow_soft_placement,
                                  log_device_placement=log_device_placement)
    sess = tf.Session(config=session_conf)
    with sess.as_default():
        # NOTE: the remaining MLP constructor arguments are cut off in this excerpt
        mlp = MLP(vocab_size=len(train.vocab),
                  num_classes=len(train.class_names))

Example #3

    # data split
    MAX_LEN = 25  # NOTE: we filter out a lot of sentences for speed
    train_data, valid_data, test_data = datasets.IWSLT.splits(
        exts=('.en', '.de'),
        fields=(SRC, TRG),
        filter_pred=lambda x: (len(vars(x)['src']) <= MAX_LEN and
                               len(vars(x)['trg']) <= MAX_LEN))

    MIN_FREQ = 5  # NOTE: we limit the vocabulary to frequent words for speed
    SRC.build_vocab(train_data.src, min_freq=MIN_FREQ)
    TRG.build_vocab(train_data.trg, min_freq=MIN_FREQ)

    PAD_INDEX = TRG.vocab.stoi[PAD_TOKEN]

    # print data info
    print_data_info(train_data, valid_data, test_data, SRC, TRG)

    # define iterator
    train_iter = data.BucketIterator(train_data,
                                     batch_size=params['batch_size'],
                                     train=True,
                                     sort_within_batch=True,
                                     sort_key=lambda x: (len(x.src), len(x.trg)),
                                     repeat=False,
                                     device=DEVICE)
    valid_iter = data.Iterator(valid_data,
                               batch_size=1,
                               train=False,
                               sort=False,
                               repeat=False,
                               device=DEVICE)
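
    # For illustration only (not part of the original example): one way to consume
    # the torchtext iterators defined above. Each Batch exposes `.src` and `.trg`
    # attributes named after the registered fields (tensors, or (tensor, lengths)
    # tuples if the fields were built with include_lengths=True).
    for batch in valid_iter:
        print(type(batch.src), type(batch.trg))
        break  # peek at the first batch only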

Example #4

    # define iterator
    train_iter = data.BucketIterator(train_data,
                                     batch_size=params['BATCH_SIZE'],
                                     device=DEVICE,
                                     sort_within_batch=True,
                                     sort_key=lambda x: len(x.text),
                                     train=True,
                                     repeat=False)

    # train_iter = data.Iterator(train_data, batch_size=1, train=False, sort=False, repeat=False, device=DEVICE)

    valid_iter = data.Iterator(valid_data,
                               batch_size=1,
                               train=False,
                               sort=False,
                               repeat=False,
                               device=DEVICE)

    test_iter = data.Iterator(test_data,
                              batch_size=1,
                              train=False,
                              sort=False,
                              repeat=False,
                              device=DEVICE)

    print_data_info(train_data, valid_data, test_data, SRC, LABEL)

    #############################

    run_lrp(test_iter, vocab=SRC.vocab, model_file='sa_model4.pt')

Example #5

    def __init__(self, data_info, time_info):

        # Print data information
        info_dict = extract(data_info, time_info)
        print_data_info(info_dict)

        # # Install hyperopt and lightgbm
        # pip_install('hyperopt')
        # pip_install('lightgbm')

        print('Using algo: {}'.format(params['algo']))

        # Settings
        if params['algo'] == Algo.ORIGINAL:
            self._dataset_budget_threshold = 0.8
            self._max_train_data = 200000
            self.batch_size = 50000
            self.delta_n_estimators = 100
            self.delta_num_leaves = 20
            self.delta_learning_rate = 0.005
            self.delta_max_depth = 1
            self.delta_feature_fraction = 0.1
            self.delta_bagging_fraction = 0.1
            self.delta_bagging_freq = 1
            self.max_evaluation = 30
            self.param_choice_fixed = {
                'n_estimators': 400,
                'learning_rate': 0.01,
                'num_leaves': 50,
                'feature_fraction': 0.6,
                'bagging_fraction': 0.6,
                'bagging_freq': 2,
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc'
            }
        elif params['algo'] == Algo.FACEBOOK_LR:
            self._dataset_budget_threshold = 0.8
            self._max_train_data = 100000
            self.batch_size = 25000
            self.delta_n_estimators = 50
            self.delta_num_leaves = 10
            self.delta_learning_rate = 0.005
            self.delta_max_depth = 1
            self.delta_feature_fraction = 0.1
            self.delta_bagging_fraction = 0.1
            self.delta_bagging_freq = 1
            self.max_evaluation = 30
            self.param_choice_fixed = {
                'n_estimators': 75,
                'learning_rate': 0.01,
                'num_leaves': 15,
                'feature_fraction': 0.6,
                'bagging_fraction': 0.6,
                'bagging_freq': 2,
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc'
            }
        elif params['algo'] == Algo.BASIC:
            self._dataset_budget_threshold = 0.8
            self._max_train_data = 100000
            self.batch_size = 25000
            self.delta_n_estimators = 50
            self.delta_num_leaves = 10
            self.delta_learning_rate = 0.005
            self.delta_max_depth = 1
            self.delta_feature_fraction = 0.1
            self.delta_bagging_fraction = 0.1
            self.delta_bagging_freq = 1
            self.max_evaluation = 30
            self.param_choice_fixed = {
                'n_estimators': 75,
                'learning_rate': 0.01,
                'num_leaves': 15,
                'feature_fraction': 0.6,
                'bagging_fraction': 0.6,
                'bagging_freq': 2,
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc'
            }

        self._train_data = np.array([])
        self._train_labels = np.array([])
        self._transformed_train_data = np.array([])
        self.best_hyperparams = {}
        self._classifier = None
        self._classifier2 = None
        self._data_processor = DataProcessor(info_dict)
        self._sampler = Sampler()

        self.mdl = StreamSaveRetrainPredictor()
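
# For illustration only (an assumption, not the original example's code): the
# delta_* step sizes above suggest a local hyperopt search centred on
# `param_choice_fixed`. A minimal sketch of such a search space for a few of the
# LightGBM parameters:
from hyperopt import hp

def build_search_space(fixed, delta_n_estimators, delta_num_leaves, delta_learning_rate):
    """Search space spanning roughly +/- one delta around the fixed parameter choice."""
    return {
        'n_estimators': hp.quniform('n_estimators',
                                    fixed['n_estimators'] - delta_n_estimators,
                                    fixed['n_estimators'] + delta_n_estimators, 25),
        'num_leaves': hp.quniform('num_leaves',
                                  fixed['num_leaves'] - delta_num_leaves,
                                  fixed['num_leaves'] + delta_num_leaves, 5),
        'learning_rate': hp.uniform('learning_rate',
                                    fixed['learning_rate'] - delta_learning_rate,
                                    fixed['learning_rate'] + delta_learning_rate),
    }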