Example #2
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.dates import YearLocator, MonthLocator, DateFormatter
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score  # sklearn.cross_validation was removed in scikit-learn 0.20
import sklearn as sk
import collections as col
import operator
from features import FeatureFactory
from random import shuffle


ff = FeatureFactory()
features = ff.getNames()
print('len features = ' + str(len(features)))

currencies = [
    'AUDUSD',
    'EURGBP',
    'EURJPY',
    'EURUSD',
    'GBPJPY',
    'GBPUSD',
    'NZDUSD',
    'USDCAD',
    'USDCHF',
    'USDJPY',
]
shuffle(currencies)
Example #3
        names=['date', 'time', 'open', 'high', 'low', 'close', 'volume'],
    )
    data = df.to_numpy()  # DataFrame.as_matrix() was removed from pandas

    # train on first 60%
    data = data[:int(len(data) * 0.60)]

    opens = data[:, 2].astype(float)
    highs = data[:, 3].astype(float)
    lows = data[:, 4].astype(float)
    closes = data[:, 5].astype(float)
    volumes = data[:, 6].astype(int)

    # calculating features
    print('calculating features...')
    ff = FeatureFactory()
    X_scaled = ff.getFeatures(opens, highs, lows, closes, volumes)
    # print X_scaled

    # set rewards
    print('calculating rewards...')
    rewards = ff.getRewards(closes)

    # fitting regressor
    # rfc = RandomForestClassifier(n_estimators=30)
    rfc = ExtraTreesClassifier(  # needs: from sklearn.ensemble import ExtraTreesClassifier
        # n_estimators=30,
        # max_features='sqrt'
    )

    # predict
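    # A hedged sketch of how the truncated snippet likely continues, using the
    # standard scikit-learn API (fit on the features, then predict rewards):
    # rfc.fit(X_scaled, rewards)
    # y_pred = rfc.predict(X_scaled)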
Example #4
import pandas as pd
import numpy as np
import sklearn as sk
from features import FeatureFactory
from random import choice, random
import time


ff = FeatureFactory()

currencies = [
    'AUDUSD',
    'EURGBP',
    'EURJPY',
    'EURUSD',
    'GBPJPY',
    'GBPUSD',
    'NZDUSD',
    'USDCAD',
    'USDCHF',
    'USDJPY',
]

actions = ['short', 'long']

def getReward(closes, start, end, action):
    closeStart = closes[start]
    closeEnd = closes[end]
    if action == 'short':
        reward = closeStart - closeEnd
    elif action == 'long':
        # the 'long' branch is cut off in the source; by symmetry with 'short'
        reward = closeEnd - closeStart
    return reward
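# Example with hypothetical values: closes = [1.10, 1.12, 1.15] gives
# getReward(closes, 0, 2, 'long') -> 0.05 and getReward(closes, 0, 2, 'short') -> -0.05.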
Example #5
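# Epsilon-greedy action selection (fragment): with probability epsilon pick a
# random action (exploration), otherwise pick the action with the highest
# estimated Q-value (exploitation).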
    if random() < epsilon:
        a = choice(actionsAvailable)
    # exploitation
    else:
        aMax = None
        QsaHighest = float('-inf')  # ensures some action is always selected, even when every Q-value is very negative
        for a in actionsAvailable:
            Qsa = getActionStateValue(thetas, features[a], a)
            if Qsa > QsaHighest:
                QsaHighest = Qsa
                aMax = a
        a = aMax
    return a


ff = FeatureFactory()
alpha = 0.1
epsilon = 0.1
gamma = 0.9
if __name__ == '__main__':
    interval = '1440'
    # interval = choice(intervals)
    for currency in currencies:
        print('\n', currency, interval)

        # load data
        opens, highs, lows, closes, volumes = loadData(currency, interval)
        print('data loaded')
        dataSize = len(closes)

        # extract features
Example #7
class STDNTrainer:
    def __init__(self):
        with open("config.json", "r", encoding="utf-8") as f:
            self.config = json.load(f)
        self.dates = np.load(self.config["date_list"])
        self.att_cover_len = self.config["attention_cover_daynum"]
        self.long_term_seq_len = self.config["long_term_sequence_len"]
        self.short_term_seq_len = self.config["short_term_sequence_len"]
        self.cnn_nbhd_size = self.config["cnn_neighbor_size"]
        self.history_feature_daynum = self.config["history_feature_daynum"]
        self.forecast_ahead_daynum = self.config["forecast_ahead_daynum"]

        self.batch_size = self.config["batch_size"]
        self.epochs = self.config["epochs"]
        self.patience = self.config["patience_earlystop"]
        # early_stop = CustomStopper(patience=patience)

        self.sampler = FeatureFactory(self.config["volume_train"],
                                      self.config["flow_train"])
        self.modeler = STDNModel()

        logging.basicConfig(level=logging.INFO,
                            format="%(asctime)s - %(levelname)s - %(message)s",
                            datefmt="%Y-%m-%d %H:%M:%S")

    def rmsle(self, y_true, y_pred):
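        # RMSLE: sqrt(mean((log(y_pred + 1) - log(y_true + 1))**2)); the +1
        # keeps the logarithm defined when a volume is zero.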
        log_true = K.log(y_true + 1)
        log_pred = K.log(y_pred + 1)
        return K.sqrt(K.mean(K.square(log_pred - log_true), axis=-1))

    def inverse_rmsle(self, x, n_sample):
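        # Undoes the sqrt-and-mean of rmsle: x**2 * n_sample recovers the
        # summed squared log error, so per-batch losses can be re-averaged
        # into one epoch-level RMSLE (the np.sqrt(np.mean(...)) in train()).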
        return np.power(x, 2) * n_sample

    def memory_monitor(self, ps):
        mem_used = ps.memory_full_info().uss / 1024 / 1024 / 1024
        logging.warning("Memory used: {:.2f} GB".format(mem_used))

    def memory_monitor_v2(self):
        """Fallback for Linux servers where psutil cannot be installed."""
        # Assumes the first five lines of /proc/meminfo are MemTotal, MemFree,
        # MemAvailable, Buffers, Cached (true on recent kernels), reported in kB.
        with open('/proc/meminfo') as f:
            mem_total = int(f.readline().split()[1])
            mem_free = int(f.readline().split()[1])
            _ = f.readline()  # skip the MemAvailable line
            mem_buffer = int(f.readline().split()[1])
            mem_cache = int(f.readline().split()[1])
        mem_used = mem_total - mem_free - mem_buffer - mem_cache
        mem_used = mem_used / 1024 / 1024  # kB -> GB takes two divisions, not three
        logging.warning("Memory used: {:.2f} GB".format(mem_used))

    def gen_input_to_model(self, att_flow_inputs, att_lstm_ex_inputs,
                           flow_inputs, lstm_ex_inputs):
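        # Flattens the batched tensors into the individually named inputs the
        # Keras graph expects. For example (hypothetical sizes): with
        # long_term_seq_len=7 and att_cover_len=7, valid_long_term_seq_len is
        # 7 - 3 = 4, so the dict holds att_lstm_ex_input_0..3,
        # att_flow_volume_input_{0..3}_{0..6}, one flow_volume_input_<ts> per
        # short-term step, and a single lstm_ex_input.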
        final_input = {}
        valid_long_term_seq_len = self.long_term_seq_len - int(
            (self.att_cover_len - 1) / 2)
        for ts in range(valid_long_term_seq_len):
            final_input["att_lstm_ex_input_{0}".format(ts)] = att_lstm_ex_inputs[:, ts]
            for att in range(self.att_cover_len):
                final_input["att_flow_volume_input_{0}_{1}".format(ts, att)] = att_flow_inputs[:, ts, att]

        for ts in range(self.short_term_seq_len):
            final_input["flow_volume_input_{0}".format(ts)] = flow_inputs[:, ts]

        final_input["lstm_ex_input"] = lstm_ex_inputs

        return final_input

    def train(self):

        model = self.modeler.build_graph(
            graph_args=self.config,
            long_term_seq_len=self.long_term_seq_len,
            att_cover_len=self.att_cover_len,
            short_term_seq_len=self.short_term_seq_len,
            nbhd_size=2 * self.cnn_nbhd_size + 1,
            output_shape=3 * self.forecast_ahead_daynum)

        init_start = self.long_term_seq_len + 1  # the earliest dates are reserved for building the long-term feature sequence
        losses = []
        best_loss = float("inf")  # any real epoch loss counts as the first improvement
        no_improve = 0
        stop_flag = False
        for ep in range(self.epochs):
            batch_start_idx = int(init_start)  # reset the date cursor at the start of every epoch
            logging.info("********** Epoch %d Begins **********" % (ep + 1))
            epoch_end = False
            loss_epoch = []
            step = 0
            while not epoch_end:
                batch_end_idx = int(batch_start_idx + self.batch_size +
                                    self.forecast_ahead_daynum)
                if batch_end_idx >= len(self.dates) - 1:
                    n_dates_batch = len(self.dates[batch_start_idx:])
                    epoch_end = True
                else:
                    n_dates_batch = batch_end_idx - batch_start_idx

                logging.info("Sampling from date {0} to {1} ...".format(
                    self.dates[batch_start_idx],
                    self.dates[batch_start_idx + n_dates_batch]))

                x_att_flow, x_att_lstm_ex, x_flow, x_short_term, y = self.sampler.sample_stdn(
                    mode="train",
                    dates=self.dates,
                    date_cursor=batch_start_idx,
                    n_dates_batch=n_dates_batch,
                    label_daynum=self.forecast_ahead_daynum,
                    long_term_seq_len=self.long_term_seq_len,
                    att_cover_len=self.att_cover_len,
                    short_term_seq_len=self.short_term_seq_len,
                    history_feature_daynum=self.history_feature_daynum,
                    cnn_nbhd_size=self.cnn_nbhd_size)

                logging.info("Sample Completed")
                # logging.info("Attention Flow CNN Feature {0}".format(x_att_flow.shape))    # (6272, 21, 7, 7, 7, 4)
                # logging.info("Attention LSTM External Feature {0}".format(x_att_lstm_ex.shape))  # (6272, 21, 7, 25)
                # logging.info("Short-Term Flow CNN Feature {0}".format(x_flow.shape))    # (6272, 7, 7, 7, 4)
                # logging.info("Short-Term External Feature {0}".format(x_short_term.shape))    # (6272, 7, 25)
                # logging.info("Labels {0}".format(y.shape))     # (6272, 45)
                # logging.info("+" * 50)

                # cache_data = list()
                # cache_data.append(x_att_flow)
                # cache_data.append(x_att_lstm_ex)
                # cache_data.append(x_flow)
                # cache_data.append([x_short_term, ])
                # cache_data.append(y)
                # pickle.dump(cache_data, open(config["saved_input_features"], "wb"))
                # np.save(config["saved_input_features"], np.array(cache_data))

                # model.fit(x=[x_att_flow, x_att_lstm_ex, x_flow, x_short_term], y=y,
                #           batch_size=batch_size, validation_split=config["validation_fraction"],
                #           epochs=epochs, shuffle=False, callbacks=[early_stop])

                feed_x = self.gen_input_to_model(x_att_flow, x_att_lstm_ex,
                                                 x_flow, x_short_term)
                loss_step = model.train_on_batch(feed_x, y)
                logging.info(
                    "training loss at iter {} of epoch {} : {}".format(
                        step, ep, loss_step))
                loss_epoch.append(
                    self.inverse_rmsle(loss_step, self.forecast_ahead_daynum))
                if epoch_end:
                    # End of epoch: record the epoch-level RMSLE, save the
                    # model whenever it improves, and stop once `patience`
                    # epochs pass with no improvement.
                    epoch_loss = np.sqrt(np.mean(loss_epoch))  # loss of whole epoch
                    losses.append(epoch_loss)
                    if epoch_loss < best_loss:
                        best_loss = epoch_loss
                        no_improve = 0
                        model.save(self.config["saved_model"] + "_epoch" +
                                   str(ep + 1) + "_loss" + str(best_loss) +
                                   ".hdf5")
                    else:
                        no_improve += 1
                    if no_improve >= self.patience:
                        stop_flag = True

                if psutil_installed:
                    ps = psutil.Process(os.getpid())
                    self.memory_monitor(ps)
                else:
                    self.memory_monitor_v2()

                batch_start_idx += self.batch_size
                if batch_start_idx >= len(self.dates) - 1:
                    epoch_end = True
                step += 1

            if stop_flag:
                break

        best_epoch = losses.index(min(losses))
        model = tf.keras.models.load_model(self.config["saved_model"] +
                                           "_epoch" + str(best_epoch + 1) +
                                           "_loss" + str(best_loss) + ".hdf5")

        # Produce a prediction for the last sample; excluding the sample itself,
        # that covers the next 15 days.
        x_att_flow, x_att_lstm_ex, x_flow, x_short_term, y = self.sampler.sample_stdn(
            mode='prediction',
            dates=self.dates,
            date_cursor=len(self.dates) - 1,
            n_dates_batch=1,
            label_daynum=self.forecast_ahead_daynum,
            long_term_seq_len=self.long_term_seq_len,
            att_cover_len=self.att_cover_len,
            short_term_seq_len=self.short_term_seq_len,
            history_feature_daynum=self.history_feature_daynum,
            cnn_nbhd_size=self.cnn_nbhd_size)
        feed_x = self.gen_input_to_model(x_att_flow, x_att_lstm_ex, x_flow,
                                         x_short_term)
        preds = model.predict(feed_x)
        logging.info("prediction result: {0}".format(preds))
        np.save(self.config["saved_prediction"], preds)
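
A minimal driver sketch for this trainer, assuming the module-level setup the
snippet implies (imports of json, os, logging, numpy as np, tensorflow as tf,
the psutil/psutil_installed fallback, plus FeatureFactory and STDNModel):

if __name__ == "__main__":
    trainer = STDNTrainer()  # reads config.json from the working directory
    trainer.train()          # trains with early stopping, then saves a prediction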
Example #8
        Qsa = getActionStateValue(thetas, features[a], a)
        if Qsa > QsaHighest:
            QsaHighest = Qsa
            aMax = a
    a = aMax
    return a


def getActionStateValue(thetas, Fsa, a):
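    # Linear function approximation: Q(s, a) is the dot product of the
    # state-action feature vector Fsa with the weight vector thetas[a].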
    # pprint(Fsa)
    # pprint(thetas[a])
    Qsa = sum(f * t for f, t in zip(Fsa, thetas[a]))
    return float(Qsa)


ff = FeatureFactory()
alpha = 0.1
epsilon = 0.1
gamma = 0.9
if __name__ == '__main__':
    interval = choice(intervals)
    for currency in currencies:
        # load data
        opens, highs, lows, closes, volumes = loadData(currency, interval)
        dataSize = len(closes)

        # extract features
        features = ff.getFeatures(opens, highs, lows, closes, volumes)
        # pprint(features)

        # load thetas
Example #10
    df = pd.read_csv(
        r'../' + currency + '1440.csv',
        names=['date', 'time', 'open', 'high', 'low', 'close', 'volume'],
    )
    data = df.to_numpy()  # DataFrame.as_matrix() was removed from pandas

    opens = data[:, 2].astype(float)
    highs = data[:, 3].astype(float)
    lows = data[:, 4].astype(float)
    closes = data[:, 5].astype(float)
    volumes = data[:, 6].astype(int)

    # calculating features
    print('\ncalculating features...')
    ff = FeatureFactory()
    X_scaled = ff.getFeatures(opens, highs, lows, closes, volumes)
    # print X_scaled

    # set rewards
    # print('\ncalculating rewards...')
    rewards = ff.getRewards(closes)  # print rewards

    # train split
    # print '\nsplitting training set'
    X_train, X_test, y_train, y_test = ff.getSplit(X_scaled, rewards)

    # fitting regressor
    # print '\nfitting regressor'
    clf = ExtraTreesClassifier(  # needs: from sklearn.ensemble import ExtraTreesClassifier
        # n_estimators=50,
Example #11
#all_event_labels = {event_label  for sent in training_set for event_labels in sent["events"] for event_label in event_labels.split(",")}

NUM_EVENT_LABELS = len(all_event_labels)
print("found {} event labels (incl. non-event) in the training dataset".format(NUM_EVENT_LABELS))
# create an event dictionary: {"event":[set of words labelled as this event],..}

# load NOMLEX dictionary
with open(nomlex_file, "r") as f:
	nomlex_dict = json.load(f)

# extract all features from all sentences
fd = defaultdict(int)

for sent in training_set:
	for i,w in enumerate(sent["words"]):
		ff = FeatureFactory(sent, i, nomlex_dict)
		fd.update(ff.extract())

for k in fd:
	fd[k] = 0
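# Every count is now reset to 0: fd is apparently kept as a fixed
# feature-name index rather than a frequency table.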

print("collected {} features".format(len(fd)))

print("unique features:{}".format(len(set(fd.keys()))))

Scores([["O","Attack","O","O","O","Business"],["O","Business","O","Business","O","Business"]],
	[["O","Business","Attack", "O","O","Business"],["O","Attack","Attack", "O","Attack","Business"]]).show()


nvi = 50
Example #13
if __name__ == '__main__':
    # randomly select currency (due to high iteration)
    currency = choice(currencies)
    interval = choice(intervals)
    print(currency, interval)

    print('loading dataframe...')
    df = pd.read_csv(
        r'../data/' + currency + interval + '.csv',
        names=['date', 'time', 'open', 'high', 'low', 'close', 'volume'],
        # parse_dates=[[0, 1]],
        # index_col=0,
    )
    print(df.tail())

    ff = FeatureFactory()

    # establish and calculate the trade set ups
    print('\n\nprocessing entries...')
    ff.getEntries(df)
    # print df

    # establish exits
    print('\n\nprocessing exits...')
    trades = ff.process(df)
    # print trades

    wins, losses = 0, 0
    tradeWon = None
    tradeLost = None
    for trade in trades:
Example #14
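# Bandit-style fragment: with no state features, the commented-out
# getActionStateValue call is replaced by the per-action weight thetas[a].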
        a = choice(range(len(ACTIONS)))
    # exploitation
    else:
        aMax = None
        QsaHighest = float('-inf')  # ensures some action is always selected, even when every Q-value is very negative
        for a, aValue in enumerate(ACTIONS):
            #Qsa = getActionStateValue(thetas, features[a], a)
            Qsa = thetas[a]
            if Qsa > QsaHighest:
                QsaHighest = Qsa
                aMax = a
        a = aMax
    return a


ff = FeatureFactory()
ALPHA = 0.1
EPSILON = 0.1
GAMMA = 0.9
if __name__ == '__main__':
    interval = choice(INTERVALS)
    for currency in CURRENCIES:
        print('\n', currency, interval)

        # load data
        df = loadData(currency, interval)
        dfIndex = df.index.values
        #print df

        ACTIONS = createActions(df)
        pprint(ACTIONS)