def get_data_for_selection(self, start_date, end_date):
    """Load 'novel_Feature' rows and build a feature frame and binary target.

    Queries the 'novel_Feature' collection through ``BaseModel`` for the
    stock codes in ``model.li`` whose ``date`` lies in
    ``[start_date, end_date]`` (both bounds inclusive), keeps the columns
    listed in ``model.names``, and drops every row that contains NaN or
    positive infinity.

    ``model.names`` has to contain ``model.label``, ``'date'`` and ``'_id'``
    because those columns are dropped from the feature frame afterwards.

    Args:
        start_date: Lower bound passed to the ``$gte`` date filter.
        end_date: Upper bound passed to the ``$lte`` date filter.

    Returns:
        ``(X, y)`` where ``X`` is the feature DataFrame and ``y`` is a
        DataFrame with a single 0/1 column named ``'sort'``.  When the
        query matches nothing, ``([], [])`` is returned.
    """
    curror = BaseModel('novel_Feature').query(
        sql=dict(stock_code={'$in': model.li},
                 date={'$gte': start_date, '$lte': end_date}))
    if not curror.count():
        return [], []

    data = pd.DataFrame(list(curror))
    data = data.loc[:, model.names]
    # np.Infinity / np.NaN were removed in NumPy 2.0; np.inf / np.nan are the
    # canonical spellings.  Only +inf is replaced, as before.
    data = data.replace(to_replace=np.inf, value=np.nan).dropna()

    # Explicit copy so the column assignments below never write into a view
    # of ``data``.
    y = data.loc[:, [model.label]].copy()
    # The index labels of the rows ordered by descending label value are
    # written positionally into 'sort'.
    # NOTE(review): this stores index labels, not each row's own rank, and
    # the threshold below compares those labels with len(y) / 2 -- confirm
    # that this is the intended labelling.
    y['sort'] = y.sort_values(by=[model.label], ascending=False).index
    y = y.drop([model.label], axis=1)
    half = len(y) / 2
    # Item access instead of ``y.sort``: attribute access would resolve to
    # the DataFrame.sort method on old pandas releases.
    y['sort'] = y['sort'].map(lambda x: 1 if x > half else 0)

    X = data.drop([model.label, 'date', '_id'], axis=1)
    return X, y
def get_data(self, start_date, end_date):
    """Load 'features_index_day' rows and build a feature frame and target.

    Queries the 'features_index_day' collection through ``BaseModel`` for
    the stock codes in ``model.li`` whose ``date`` lies in
    ``[start_date, end_date]`` (both bounds inclusive), keeps the columns
    ``model.names + [model.label]``, and drops every row that contains NaN
    or positive infinity.

    Args:
        start_date: Lower bound passed to the ``$gte`` date filter; it is
            also printed to stdout.
        end_date: Upper bound passed to the ``$lte`` date filter.

    Returns:
        ``(X, y)`` where ``X`` holds the ``model.names`` columns and ``y``
        is a DataFrame with a single column ``'sort'`` produced by
        ``self.classify``.  Two empty DataFrames are returned when the
        query matches nothing.
    """
    print(start_date)
    curror = BaseModel('features_index_day').query(
        sql=dict(stock_code={'$in': model.li},
                 date={'$gte': start_date, '$lte': end_date}))
    if not curror.count():
        return pd.DataFrame([]), pd.DataFrame([])

    data = pd.DataFrame(list(curror))
    data = data.loc[:, model.names + [model.label]]
    # np.Infinity / np.NaN were removed in NumPy 2.0; np.inf / np.nan are the
    # canonical spellings.  Only +inf is replaced, as before.
    data = data.replace(to_replace=np.inf, value=np.nan).dropna()

    # Explicit copy so the column assignments below never write into a view
    # of ``data``.
    y = data.loc[:, [model.label]].copy()
    # The index labels of the rows ordered by descending label value are
    # written positionally into 'sort' and then handed to self.classify.
    # NOTE(review): this stores index labels, not each row's own rank --
    # confirm that this is what classify() expects.
    y['sort'] = y.sort_values(by=[model.label], ascending=False).index
    y = y.drop([model.label], axis=1)
    # Item access instead of ``y.sort``: attribute access would resolve to
    # the DataFrame.sort method on old pandas releases.
    y['sort'] = y['sort'].map(lambda x: self.classify(x))

    X = data.drop([model.label], axis=1)
    return X, y
def min60(time):
    """Roll today's 5-minute bars up into one 60-minute bar per stock.

    Nothing happens unless ``time`` is one of 1030, 1130, 1400 or 1500
    (HHMM integers).  For those values the 'real_kline_min5' rows of
    ``Calendar.today()`` whose ``time`` lies in ``(time - 1h, time]`` are
    grouped by ``stock_code`` and the resulting bars are inserted into
    'real_kline_min60'.

    Args:
        time: Closing time of the 60-minute bar as an HHMM integer.

    Returns:
        None.
    """
    if time not in (1030, 1130, 1400, 1500):
        return

    # HHMM -> minutes since midnight, step back one hour, then back to HHMM.
    hours, minutes = divmod(time, 100)
    start_hours, start_minutes = divmod(hours * 60 + minutes - 60, 60)
    window_start = start_hours * 100 + start_minutes
    print(window_start)

    query = {
        'time': {'$gt': window_start, '$lte': time},
        'date': Calendar.today(),
    }
    cursor = BaseModel('real_kline_min5').query(sql=query)
    if not cursor.count():
        return

    # Sort by time first so that 'first' / 'last' pick the opening and
    # closing 5-minute bar of the window.
    bars_5min = pd.DataFrame(list(cursor)).sort_values(by=['time'],
                                                       ascending=True)
    bar_rules = {
        'volume': 'sum',
        'amount': 'sum',
        'open': 'first',
        'close': 'last',
        'high': 'max',
        'low': 'min',
    }
    bars_60min = bars_5min.groupby(by=['stock_code'],
                                   as_index=False).agg(bar_rules)
    bars_60min['time'] = time
    bars_60min['date'] = Calendar.today()

    records = bars_60min.to_dict(orient='records')
    BaseModel('real_kline_min60').insert_batch(records)
    print('min60 ok')


# min60(time=1030)
if __name__ == '__main__': # clf = DecisionTreeClassifier() # clf = LogisticRegression() table = 'features_index_day' clf = RandomForestClassifier(n_estimators=200, n_jobs=2) label = 'change_r_next2' curror = BaseModel(table).query( sql={ 'date': { '$gte': dt.datetime(2018, 1, 5), '$lte': dt.datetime(2018, 7, 25) } }) print(curror.count()) if curror.count(): data = pd.DataFrame(list(curror)) data = data.replace(to_replace=np.Infinity, value=np.NaN).dropna() name = data.columns.values.tolist() names = list() # data=data.sample(frac=0.2) for i in name: if i not in [ 'stock_code', 'date', 'time', 'change_r', 'change_r_next', 'change_r_next16', 'change_r_next2', 'change_r_next3', 'change_r_next4', 'change_r_next48', '_id', 'classtype',
def deal_data(code):
    """Build the daily feature table for one stock and store it.

    Steps, in order:

    1. Read the daily k-line of ``code`` for 2015-01-01 .. 2018-09-17 via
       ``KlineData.read_data``; return without doing anything if it is
       empty.
    2. Append the technical indicators (KDJ, MA, MACD, RSI, DMI, BRAR, CR,
       VR, WR, CCI, BOLL, PSY) through the ``n_*`` helpers.
    3. Min-max scale every indicator column that is neither boolean nor a
       ``change_r_next*`` column.
    4. Fill the fundamental columns from 'jq_fund_data', using for every
       row the report day that opens the period the row's date falls in.
    5. Derive ``turnover_ratio``, min-max scale the raw price/volume
       columns, reverse the row order, add ``count`` (days since the
       'LaunchDate' entry, 0 if there is none) and insert the rows dated
       2016-01-01 or later into 'features_kline_day'.

    Args:
        code: Stock code passed to ``KlineData.read_data`` and used as the
            ``code`` / ``stock_code`` filter of the database queries.

    Returns:
        None.

    Raises:
        IndexError: If 'jq_fund_data' has no row for one of the report
            days listed in ``report_days``.
    """
    data = KlineData.read_data(code=code,
                               start_date=dt.datetime(2015, 1, 1),
                               end_date=dt.datetime(2018, 9, 17),
                               kline='kline_day',
                               timemerge=True)
    raw_columns = data.columns.values.tolist()
    if not len(data):
        return

    data = data.drop(['_id', 'classtype', 'market'], axis=1)
    data = n_KDJ(data, 9, 3)
    data = n_MA(data)
    data = n_MACD(data, 12, 26, 9)
    data = n_RSI(data, 6, 12, 24)
    data = n_DMI(data, 14, 6)
    data = n_BRAR(data, 26)
    data = n_CR(data, 26, 10, 20, 40)
    data = n_VR(data, 26)
    data = n_WR(data, 10, 6)
    data = n_CCI(data, 14)
    data = n_BOLL(data, 20)
    data = n_PSY(data, 12)

    # Snapshot taken before the fundamental columns are added, so those are
    # never touched by the indicator scaling below.
    columns = data.columns.values.tolist()
    # np.Infinity / np.NaN were removed in NumPy 2.0; np.inf / np.nan are the
    # canonical spellings.  Only +inf is replaced, as before.
    data = data.replace(to_replace=np.inf, value=np.nan)

    fund_data = pd.DataFrame(
        list(BaseModel('jq_fund_data').query({'code': code})))
    jq_name = [
        'capitalization', 'circulating_cap', 'circulating_market_cap',
        'market_cap', 'pb_ratio', 'pcf_ratio', 'pe_ratio', 'pe_ratio_lyr',
        'ps_ratio', 'turnover_ratio'
    ]
    for col in jq_name:
        data[col] = 0

    # Min-max scale the derived indicator columns only.
    for col in columns:
        if col in raw_columns or 'change_r_next' in col:
            continue
        first_value = data[col].iloc[0]
        if type(first_value).__name__ == 'bool_':
            continue
        if data[col].min() == 0:
            # Zero entries are left untouched; only the non-zero entries
            # are scaled, against their own min / max.
            non_zero = data[col] != 0
            values = data.loc[non_zero, col]
            data.loc[non_zero, col] = (values - values.min()) / (
                values.max() - values.min())
        else:
            data[col] = (data[col] - data[col].min()) / (
                data[col].max() - data[col].min())

    # Each report day opens a period that lasts until the next report day
    # (exclusive); the last period is open-ended.  Rows dated before the
    # first report day keep the 0 placeholders.
    report_days = (
        '2015-12-31', '2016-03-31', '2016-06-30', '2016-09-30',
        '2016-12-30', '2017-03-31', '2017-06-30', '2017-09-29',
        '2017-12-29', '2018-03-30', '2018-06-29',
    )
    period_starts = [
        dt.datetime.strptime(day, '%Y-%m-%d') for day in report_days
    ]
    period_ends = period_starts[1:] + [None]
    for day, start, end in zip(report_days, period_starts, period_ends):
        # .iloc[0] raises IndexError when the report row is missing.
        values = fund_data[fund_data.day == day] \
            .loc[:, jq_name].iloc[0].tolist()
        in_period = data.date >= start
        if end is not None:
            in_period = in_period & (data.date < end)
        data.loc[in_period, jq_name] = values

    # Computed from the raw volume, i.e. before volume is scaled below.
    data['turnover_ratio'] = data.volume / (data.circulating_cap * 10000)
    for col in ['open', 'close', 'high', 'low', 'amount', 'volume']:
        data[col] = (data[col] - data[col].min()) / (
            data[col].max() - data[col].min())

    # Reverse the row order and renumber the index.
    data = data[::-1]
    data = data.reset_index(drop=True)

    curror = BaseModel('LaunchDate').query(sql={'stock_code': code})
    data['count'] = 0
    if curror.count():
        launchdate = list(curror)[0]['date']
        data['count'] = data.date.map(lambda x: (x - launchdate).days)

    data = data[data.date >= dt.datetime(2016, 1, 1)]
    BaseModel('features_kline_day').insert_batch(
        data.to_dict(orient='records'))
from sklearn import tree import pandas as pd import datetime as dt from Calf.models.base_model import BaseModel from sklearn.metrics import classification_report if __name__ == '__main__': curror = BaseModel('novel_Feature').query( sql=dict(stock_code=1, date={ '$gte': dt.datetime(2018, 4, 15), '$lte': dt.datetime(2018, 4, 23) })) print(curror.count()) data = pd.DataFrame(list(curror)) data['change_r_next'] = data.change_r_next.map(lambda x: 1 if x > 0 else 0) data_test = data[data.date == dt.datetime(2018, 4, 23)] data = data[data.date < dt.datetime(2018, 4, 23)] y = data.loc[:, ['change_r_next']] X = data.drop(['_id', 'change_r_next', 'date'], axis=1) X_test = data_test.drop(['_id', 'change_r_next', 'date'], axis=1) y_test = data_test.loc[:, ['change_r_next']] clf = tree.DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None, max_leaf_nodes=None, max_features=None,