Ejemplo n.º 1
0
    def get_data_for_selection(self, start_date, end_date):
        """Load feature rows from ``novel_Feature`` and split them into X / y.

        Rows are fetched for the stock codes in ``model.li`` whose ``date``
        lies in ``[start_date, end_date]`` (both ends inclusive), restricted
        to the columns in ``model.names``, and any row containing ``+inf`` or
        a missing value is dropped.

        Args:
            start_date: Lower bound for the ``date`` field (``$gte``).
            end_date: Upper bound for the ``date`` field (``$lte``).

        Returns:
            ``(X, y)`` where ``X`` is the cleaned frame without the label,
            ``date`` and ``_id`` columns and ``y`` is a one-column frame
            (``sort``) of 0/1 labels. ``([], [])`` is returned when the query
            matches nothing or when no row survives cleaning.
        """
        curror = BaseModel('novel_Feature').query(
            sql=dict(stock_code={'$in': model.li},
                     date={
                         '$gte': start_date,
                         '$lte': end_date
                     }))
        if not curror.count():
            return [], []

        data = pd.DataFrame(list(curror)).loc[:, model.names]
        # np.inf / np.nan are the canonical spellings; the np.Infinity and
        # np.NaN aliases no longer exist in NumPy 2.0.
        # NOTE(review): only +inf is replaced here, -inf is left untouched.
        data = data.replace(to_replace=np.inf, value=np.nan).dropna()
        if data.empty:
            # Every row was dropped: report "no data" the same way as the
            # empty-query branch above instead of failing further down.
            return [], []

        y = data.loc[:, [model.label]]
        # The Index is assigned positionally: row i receives the index label
        # of the row holding the i-th largest label value.
        # NOTE(review): this is not the rank of row i itself - confirm that
        # this is the intended labelling.
        y['sort'] = y.sort_values(by=[model.label], ascending=False).index
        y = y.drop([model.label], axis=1)
        half = len(y) / 2
        # Item access rather than ``y.sort`` so the column can never be
        # confused with a DataFrame attribute of the same name.
        y['sort'] = y['sort'].map(lambda x: 1 if x > half else 0)
        X = data.drop([model.label, 'date', '_id'], axis=1)
        return X, y
Ejemplo n.º 2
0
    def get_data(self, start_date, end_date):
        """Load feature rows from ``features_index_day`` and split into X / y.

        Rows are fetched for the stock codes in ``model.li`` whose ``date``
        lies in ``[start_date, end_date]`` (both ends inclusive), restricted
        to ``model.names`` plus ``model.label``, and any row containing
        ``+inf`` or a missing value is dropped.

        Args:
            start_date: Lower bound for the ``date`` field (``$gte``).
            end_date: Upper bound for the ``date`` field (``$lte``).

        Returns:
            ``(X, y)`` where ``X`` holds the ``model.names`` columns and ``y``
            is a one-column frame (``sort``) produced by ``self.classify``.
            Two empty DataFrames are returned when the query matches nothing
            or when no row survives cleaning.
        """
        print(start_date)
        curror = BaseModel('features_index_day').query(
            sql=dict(stock_code={'$in': model.li},
                     date={
                         '$gte': start_date,
                         '$lte': end_date
                     }))
        if not curror.count():
            return pd.DataFrame([]), pd.DataFrame([])

        data = pd.DataFrame(list(curror)).loc[:, model.names + [model.label]]
        # np.inf / np.nan are the canonical spellings; the np.Infinity and
        # np.NaN aliases no longer exist in NumPy 2.0.
        # NOTE(review): only +inf is replaced here, -inf is left untouched.
        data = data.replace(to_replace=np.inf, value=np.nan).dropna()
        if data.empty:
            # Every row was dropped: report "no data" the same way as the
            # empty-query branch above instead of failing further down.
            return pd.DataFrame([]), pd.DataFrame([])

        y = data.loc[:, [model.label]]
        # The Index is assigned positionally: row i receives the index label
        # of the row holding the i-th largest label value.
        # NOTE(review): this is not the rank of row i itself - confirm that
        # this is the intended input for ``classify``.
        y['sort'] = y.sort_values(by=[model.label], ascending=False).index
        y = y.drop([model.label], axis=1)
        # Item access rather than ``y.sort`` so the column can never be
        # confused with a DataFrame attribute of the same name.
        y['sort'] = y['sort'].map(lambda x: self.classify(x))
        X = data.drop([model.label], axis=1)
        return X, y
Ejemplo n.º 3
0
def min60(time):
    """Aggregate the preceding hour of ``real_kline_min5`` rows per stock.

    Nothing happens unless ``time`` (an ``HHMM`` integer) is one of 1030,
    1130, 1400 or 1500. Otherwise today's rows whose ``time`` lies in
    ``(time - 60 minutes, time]`` are read, combined per ``stock_code``
    (summed volume/amount, first open, last close, max high, min low) and
    written to ``real_kline_min60`` stamped with ``time`` and today's date.

    Args:
        time: Closing time of the bar as an ``HHMM`` integer, e.g. ``1030``.
    """
    if time not in (1030, 1130, 1400, 1500):
        return

    # HHMM -> minutes since midnight, step back one hour, then back to HHMM.
    hours, minutes = divmod(time, 100)
    start_minutes = hours * 60 + minutes - 60
    window_start = start_minutes // 60 * 100 + start_minutes % 60
    print(window_start)

    query = {
        'time': {'$gt': window_start, '$lte': time},
        'date': Calendar.today(),
    }
    cursor = BaseModel('real_kline_min5').query(sql=query)
    if not cursor.count():
        return

    # Sort by time first so that 'first' / 'last' pick the opening and
    # closing rows of the window.
    bars = pd.DataFrame(list(cursor)).sort_values(by=['time'], ascending=True)
    rollup = {
        'volume': 'sum',
        'amount': 'sum',
        'open': 'first',
        'close': 'last',
        'high': 'max',
        'low': 'min',
    }
    hourly = bars.groupby(by=['stock_code'], as_index=False).agg(rollup)
    hourly['time'] = time
    hourly['date'] = Calendar.today()

    records = hourly.to_dict(orient='records')
    BaseModel('real_kline_min60').insert_batch(records)
    print('min60 ok')


# min60(time=1030)
Ejemplo n.º 4
0
if __name__ == '__main__':
    # clf = DecisionTreeClassifier()
    # clf = LogisticRegression()
    table = 'features_index_day'
    clf = RandomForestClassifier(n_estimators=200, n_jobs=2)
    label = 'change_r_next2'
    curror = BaseModel(table).query(
        sql={
            'date': {
                '$gte': dt.datetime(2018, 1, 5),
                '$lte': dt.datetime(2018, 7, 25)
            }
        })

    print(curror.count())
    if curror.count():

        data = pd.DataFrame(list(curror))

        data = data.replace(to_replace=np.Infinity, value=np.NaN).dropna()
        name = data.columns.values.tolist()
        names = list()

        # data=data.sample(frac=0.2)

        for i in name:
            if i not in [
                    'stock_code', 'date', 'time', 'change_r', 'change_r_next',
                    'change_r_next16', 'change_r_next2', 'change_r_next3',
                    'change_r_next4', 'change_r_next48', '_id', 'classtype',
Ejemplo n.º 5
0
# Values compared against ``fund_data.day``. Each report applies from its own
# date (inclusive) up to the next report date (exclusive); the last one is
# open-ended.
_FUND_REPORT_DAYS = (
    '2015-12-31', '2016-03-31', '2016-06-30', '2016-09-30', '2016-12-30',
    '2017-03-31', '2017-06-30', '2017-09-29', '2017-12-29', '2018-03-30',
    '2018-06-29',
)


def _min_max_scale(values):
    """Return *values* shifted and scaled by its own min and max."""
    low = values.min()
    return (values - low) / (values.max() - low)


def _fill_fund_columns(data, fund_data, fund_columns):
    """Write each report's *fund_columns* onto the rows of its date period."""
    starts = [
        dt.datetime.strptime(day, '%Y-%m-%d') for day in _FUND_REPORT_DAYS
    ]
    ends = starts[1:] + [None]
    for day, start, end in zip(_FUND_REPORT_DAYS, starts, ends):
        in_period = data.date >= start
        if end is not None:
            in_period = in_period & (data.date < end)
        # ``.iloc[0]`` raises IndexError when the report row is missing.
        report = fund_data[fund_data.day == day].loc[:, fund_columns].iloc[0]
        data.loc[in_period, fund_columns] = report.tolist()


def deal_data(code):
    """Build the daily feature rows for one stock and store them.

    Steps, in order:

    1. Read daily k-line data for ``code`` (2015-01-01 .. 2018-09-17).
    2. Drop ``_id`` / ``classtype`` / ``market`` and run the ``n_*`` helpers
       (KDJ, MA, MACD, RSI, DMI, BRAR, CR, VR, WR, CCI, BOLL, PSY).
    3. Replace ``+inf`` with NaN and min-max scale every column added by the
       helpers, except ``change_r_next*`` columns and boolean columns.
    4. Attach the ``jq_fund_data`` values of the matching report period and
       recompute ``turnover_ratio`` from volume and ``circulating_cap``.
    5. Min-max scale the price / volume columns, reverse the row order, add
       ``count`` (days since the ``LaunchDate`` record, 0 if there is none),
       keep rows dated 2016-01-01 or later and insert them into
       ``features_kline_day``.

    Nothing is done when the k-line query yields no rows.

    Args:
        code: Stock code used for the k-line, ``jq_fund_data`` and
            ``LaunchDate`` lookups.

    Raises:
        IndexError: If ``jq_fund_data`` lacks one of the expected report days.
    """
    data = KlineData.read_data(code=code,
                               start_date=dt.datetime(2015, 1, 1),
                               end_date=dt.datetime(2018, 9, 17),
                               kline='kline_day',
                               timemerge=True)

    raw_columns = set(data.columns.values.tolist())
    if not len(data):
        return

    data = data.drop(['_id', 'classtype', 'market'], axis=1)
    data = n_KDJ(data, 9, 3)
    data = n_MA(data)
    data = n_MACD(data, 12, 26, 9)
    data = n_RSI(data, 6, 12, 24)
    data = n_DMI(data, 14, 6)
    data = n_BRAR(data, 26)
    data = n_CR(data, 26, 10, 20, 40)
    data = n_VR(data, 26)
    data = n_WR(data, 10, 6)
    data = n_CCI(data, 14)
    data = n_BOLL(data, 20)
    data = n_PSY(data, 12)
    columns = data.columns.values.tolist()
    # np.inf / np.nan are the canonical spellings; the np.Infinity and np.NaN
    # aliases no longer exist in NumPy 2.0.
    data = data.replace(to_replace=np.inf, value=np.nan)

    fund_data = pd.DataFrame(
        list(BaseModel('jq_fund_data').query({'code': code})))
    jq_name = [
        'capitalization', 'circulating_cap', 'circulating_market_cap',
        'market_cap', 'pb_ratio', 'pcf_ratio', 'pe_ratio', 'pe_ratio_lyr',
        'ps_ratio', 'turnover_ratio'
    ]
    for column in jq_name:
        data[column] = 0

    # Scale only the columns added by the n_* helpers above.
    for column in columns:
        if column in raw_columns or 'change_r_next' in column:
            continue
        # isinstance instead of comparing the type's __name__, which is not
        # a stable identifier for NumPy's boolean scalar.
        if isinstance(data[column].iloc[0], np.bool_):
            continue
        if data[column].min() == 0:
            # Zeros are left as they are; only the non-zero entries are
            # rescaled among themselves.
            nonzero = data[column] != 0
            data.loc[nonzero, column] = _min_max_scale(
                data.loc[nonzero, column])
        else:
            data[column] = _min_max_scale(data[column])

    _fill_fund_columns(data, fund_data, jq_name)

    # Uses the raw volume, so this must run before volume is scaled below.
    data['turnover_ratio'] = data.volume / (data.circulating_cap * 10000)
    for column in ['open', 'close', 'high', 'low', 'amount', 'volume']:
        data[column] = _min_max_scale(data[column])

    data = data[::-1].reset_index(drop=True)

    curror = BaseModel('LaunchDate').query(sql={'stock_code': code})
    data['count'] = 0
    if curror.count():
        launchdate = list(curror)[0]['date']
        data['count'] = data.date.map(lambda x: (x - launchdate).days)

    data = data[data.date >= dt.datetime(2016, 1, 1)]
    BaseModel('features_kline_day').insert_batch(
        data.to_dict(orient='records'))
Ejemplo n.º 6
0
from sklearn import tree

import pandas as pd
import datetime as dt

from Calf.models.base_model import BaseModel
from sklearn.metrics import classification_report

if __name__ == '__main__':
    curror = BaseModel('novel_Feature').query(
        sql=dict(stock_code=1,
                 date={
                     '$gte': dt.datetime(2018, 4, 15),
                     '$lte': dt.datetime(2018, 4, 23)
                 }))
    print(curror.count())
    data = pd.DataFrame(list(curror))

    data['change_r_next'] = data.change_r_next.map(lambda x: 1 if x > 0 else 0)
    data_test = data[data.date == dt.datetime(2018, 4, 23)]
    data = data[data.date < dt.datetime(2018, 4, 23)]

    y = data.loc[:, ['change_r_next']]
    X = data.drop(['_id', 'change_r_next', 'date'], axis=1)
    X_test = data_test.drop(['_id', 'change_r_next', 'date'], axis=1)
    y_test = data_test.loc[:, ['change_r_next']]
    clf = tree.DecisionTreeClassifier(class_weight=None,
                                      criterion='gini',
                                      max_depth=None,
                                      max_leaf_nodes=None,
                                      max_features=None,