def save_csv_output(out_dir: Path, tag: str, df_T9c: DataFrame, df_T9r: DataFrame) -> None:
    """
    Wrapper to save csv files (counts.csv and weights.csv) to the matrix output folder.

    Args:
        out_dir (Path): path to the matrix output folder
        tag (str): dataset tag, either "cls" or "clsaux"
        df_T9c (DataFrame): classification weight table T9c
        df_T9r (DataFrame): regression weight table T9r
    """
    df_T9c = df_T9c.rename(
        columns={
            "cont_classification_task_id": "task_id",
            "assay_type": "task_type",
            "weight": "training_weight",
        })
    df_T9c = df_T9c.dropna(subset=["task_id"]).sort_values("task_id")
    df_T9c["task_id"] = df_T9c["task_id"].astype(int)
    df_T9r = df_T9r.rename(
        columns={
            "cont_regression_task_id": "task_id",
            "assay_type": "task_type",
            "weight": "training_weight",
        })
    df_T9r = df_T9r.dropna(subset=["task_id"]).sort_values("task_id")
    df_T9r["task_id"] = df_T9r["task_id"].astype(int)
    if tag == "cls":
        out_dir_cls = out_dir / "cls"
        out_dir_cls.mkdir(exist_ok=True)
        save_df_as_csv(
            out_dir_cls,
            df_T9c,
            "cls_weights",
            ["task_id", "task_type", "training_weight", "aggregation_weight"],
        )
        out_dir_reg = out_dir / "reg"
        out_dir_reg.mkdir(exist_ok=True)
        save_df_as_csv(
            out_dir_reg,
            df_T9r,
            "reg_weights",
            [
                "task_id",
                "task_type",
                "training_weight",
                "aggregation_weight",
                "censored_weight",
            ],
        )
    if tag == "clsaux":
        out_dir_clsaux = out_dir / "clsaux"
        out_dir_clsaux.mkdir(exist_ok=True)
        save_df_as_csv(
            out_dir_clsaux,
            df_T9c,
            "clsaux_weights",
            ["task_id", "task_type", "training_weight", "aggregation_weight"],
        )
def filter_by_mad(data: npt.NDArray, m: float = 2) -> npt.NDArray:
    """
    Take a numpy array, calculate the median and the mean absolute deviation (MAD),
    and filter the data, removing outliers.
    """
    # turn data into a DataFrame to do the MAD calculations
    df_output = DataFrame(np.array(data), columns=['Data'])
    median = df_output['Data'].median(axis=0)
    # Series.mad() was removed in pandas 2.0; compute the mean absolute deviation explicitly
    mad = (df_output['Data'] - df_output['Data'].mean()).abs().mean()
    df_output['Abs_Dev'] = np.abs(data - median) / mad
    # mask values more than m deviations away from the median, by default m = 2
    df_output['Abs_Dev'].mask(df_output['Abs_Dev'] > m, inplace=True)
    df_output.dropna(how='any', inplace=True)  # eliminate NaNs
    return df_output['Data'].to_numpy()
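# Added usage sketch (not part of the original source): exercising filter_by_mad on a
# small array. Assumes numpy is imported as np, as in the function above.
_sample = np.array([9.8, 10.1, 10.0, 9.9, 10.2, 100.0])
# 100.0 lies far from the median relative to the mean absolute deviation, so with the
# default m=2 it should be removed while the values near 10 are kept.
print(filter_by_mad(_sample, m=2))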
def pre_processing(df: DataFrame):
    """
    input : a data frame
    outputs:
        clean data frame
        dtypes.txt : a file with the dtype of each column
        database: information.sqlite
            tables:
                information : clean, scaled data frame
                before_process : data before processing
                missing_information : output of the missing_data function
                after_clear : cleaned data before scaling
                outliers : outlier rows
                describe : summary statistics of the clean data
    Description:
        drop unused columns and null rows
        drop outlier rows with the IQR method
        label-encode and robust-scale the remaining columns
        save the information in the database
    """
    sql_manager = SqlManager("information.sqlite")
    df.to_sql(name="before_process", con=sql_manager.conn, if_exists="replace")
    missing_data_df = missing_data(df)
    missing_data_df.to_sql(name="missing_information", con=sql_manager.conn,
                           if_exists="replace")
    df = df.drop(columns=[
        "status_id", "status_published", "Column1", "Column2", "Column3", "Column4"
    ])
    main_df = df.dropna()
    print(main_df.shape)
    outliers_df, main_df = drop_numerical_outliers(main_df)
    main_df = main_df[main_columns]
    outliers_df.to_sql(name="outliers", con=SqlManager("information.sqlite").conn,
                       if_exists="replace", index=False)
    main_df.to_sql(name="after_clear", con=SqlManager("information.sqlite").conn,
                   if_exists="replace", index=False)
    label_encode(main_df)
    scaled_df = DataFrame(preprocessing.robust_scale(main_df), columns=main_columns)
    scaled_df.to_sql(name="information", con=SqlManager("information.sqlite").conn,
                     if_exists="replace", index=False)
    print(main_df.shape)
    main_df.describe().to_sql(name="describe", con=sql_manager.conn, if_exists="replace")
    create_folder("outs")
    with open("outs\\dtypes.txt", "w") as file:
        file.write(str(main_df.dtypes))
    return main_df
def pre_processing(df: DataFrame):
    """
    input : a data frame
    outputs:
        clean data frame
        dtypes.txt : a file with the dtype of each column
        database: information.sqlite
            tables:
                information : clean data frame
                before_process : data before processing
                missing_information : output of the missing_data function
                outliers : outlier rows
                describe : summary statistics of the clean data
    Description:
        delete null rows
        merge capital_gain and capital_loss into a single capital column
        delete the education and fnlwgt columns
        delete outlier rows with the IQR method
        save the information in the database
    """
    sql_manager = SqlManager("information.sqlite")
    df.to_sql(name="before_process", con=sql_manager.conn, if_exists="replace")
    df.replace('?', np.nan, inplace=True)
    missing_data_df = missing_data(df)
    missing_data_df.to_sql(name="missing_information", con=sql_manager.conn,
                           if_exists="replace")
    main_df = df.dropna()
    print(main_df.shape)
    main_df = main_df.drop(columns=['education', "fnlwgt"])
    outliers_df, main_df = drop_numerical_outliers(main_df)
    main_df["capital"] = main_df["capital_gain"] - main_df["capital_loss"]
    main_df = main_df.drop(columns=["capital_gain", "capital_loss"])
    main_df = main_df[['age', 'workclass', 'education_num', 'marital_status',
                       'post', 'relationship', 'nation', 'gender', 'capital',
                       'hours_per_week', 'country', 'wealth']]
    outliers_df.to_sql(name="outliers", con=SqlManager("information.sqlite").conn,
                       if_exists="replace", index=False)
    main_df.to_sql(name="information", con=SqlManager("information.sqlite").conn,
                   if_exists="replace", index=False)
    print(main_df.shape)
    main_df.describe().to_sql(name="describe", con=sql_manager.conn, if_exists='replace')
    with open("outs\\dtypes.txt", "w") as file:
        file.write(str(main_df.dtypes))
    return main_df
def select_size_filtered_data(
    data_size_filtered: DataFrame, data_grouped: DataFrame
) -> DataFrame:
    """
    Get non-duplicated and quorum-consistent entries

    Args:
        data_size_filtered (DataFrame): Data fulfilling the quorum
        data_grouped (DataFrame): grouped and non-duplicated data

    Returns:
        DataFrame: non-duplicated data with label counts > quorum.
    """
    data_filtered = data_size_filtered.dropna().astype("int32")
    data_filtered_stacked = data_filtered.stack().reset_index()
    data_filtered_out = data_grouped[
        data_grouped.classification_task_id.isin(
            data_filtered_stacked.classification_task_id
        )
    ]
    return data_filtered_out
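# Added usage sketch (illustrative only; everything except the "classification_task_id"
# name is an assumption, not from the original source). data_size_filtered holds
# per-task label counts with NaN where the quorum is not met; data_grouped is the
# deduplicated activity data to be filtered.
import numpy as np
import pandas as pd

_data_size_filtered = pd.DataFrame(
    {"active": [30.0, np.nan], "inactive": [25.0, 40.0]},
    index=pd.Index([101, 102], name="classification_task_id"),
)
_data_grouped = pd.DataFrame(
    {"classification_task_id": [101, 102, 103], "value": [1, 0, 1]}
)
# only task 101 survives: task 102 has a NaN count, task 103 never met the quorum
print(select_size_filtered_data(_data_size_filtered, _data_grouped))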
def wordTrendTest(self, cur=None, basic_path=None, word_trend_files=None, stock_id=None, start_date=None, end_date=None, rate=3): if cur == None or word_trend_files == None or stock_id == None or start_date == None or end_date == None: return None if basic_path is None: basic_path = os.path.dirname(os.path.abspath(__file__)) for k in range(len(word_trend_files)): word_trend_files[k] = os.path.join(basic_path, word_trend_files[k]) output_path = os.path.join(basic_path, word_trend_files[0] + '.test') if not os.path.exists(output_path): cur.execute( "SELECT stock_id, percentage_difference, date FROM history WHERE stock_id in (%s) and date between '%s' and '%s' order by stock_id, date " % (stock_id, start_date, end_date)) history_t = cur.fetchall() history_temp = [] for h in zip(*history_t): history_temp.append(h) history = { 'stock_id': history_temp[0], 'percentage_difference': history_temp[1], 'date': history_temp[2] } del history_t, history_temp history = DataFrame(history) g_history = history.groupby(by=['stock_id']) history.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True) left_shift = 2 stocks = {} for k in history['percentage_difference'].keys(): date = str(history['date'][k]) if date not in stocks and ( k - left_shift) in history['date'] and ( history['percentage_difference'][k] >= rate or history['percentage_difference'][k] <= -rate): stocks[date] = [ str(history['date'][k - left_shift]), history['percentage_difference'][k] ] date_str_arr = [] date_rate = {} for k in stocks: date_str_arr.append('"%s"' % stocks[k][0]) date_rate[stocks[k][0]] = stocks[k][1] date_str = ",".join(date_str_arr) cur.execute( "SELECT id, content FROM news WHERE time in (%s) order by time, content" % (date_str)) news_temp = cur.fetchall() news = {} for n in news_temp: news[str(n[0])] = n[1] del news_temp cur.execute( "SELECT GROUP_CONCAT(id SEPARATOR ','), time FROM news WHERE time in (%s) group by time" % (date_str)) news_time = cur.fetchall() ys = [] for f in word_trend_files: word_trend = {} word_trend_temp = pd.read_csv(f) for k in word_trend_temp["0"].keys(): word_trend[word_trend_temp["0"][k]] = [ word_trend_temp["1"][k], word_trend_temp["2"][k] ] if int(word_trend['total_words'][0]) == 0 or int( word_trend['total_words'][1]) == 0: return None p_up = word_trend['total_words'][0] / ( word_trend['total_words'][0] + word_trend['total_words'][1]) p_down = word_trend['total_words'][1] / ( word_trend['total_words'][0] + word_trend['total_words'][1]) date_trend = {} i = 0 print(len(news_time)) for nt in news_time: date = str(nt[1]) date_trend[date] = 0 if date not in date_rate: continue sids = nt[0].split(",") for sid in sids: sid = str(sid) if sid not in news: continue words = self.jiebafenci(news[sid]) wp_up = p_up wp_down = p_down for w in words: if w not in word_trend: wp_up *= (1 / word_trend['total_words'][0]) wp_down *= (1 / word_trend['total_words'][1]) else: if word_trend[w][0] > 0: wp_up *= word_trend[w][0] else: wp_up *= (1 / word_trend['total_words'][0]) if word_trend[w][1] > 0: wp_down *= word_trend[w][1] else: wp_down *= (1 / word_trend['total_words'][1]) while True: if wp_up < 1 and wp_down < 1: wp_up *= 10 wp_down *= 10 else: break date_trend[date] += wp_up / (wp_up + wp_down) if len(sids) != 0: date_trend[date] /= len(sids) i += 1 print(i) data = [] for k in date_trend: data.append([ k, float(date_rate[k]), float('%.2f' % date_trend[k]), float('%.2f' % (1 - float('%.2f' % date_trend[k]))) ]) ys.append(data) data = ys pd.DataFrame(data).to_csv(output_path, index=False, mode="a", 
encoding="utf-8") else: data = pd.read_csv(output_path) word_trend_num = 0 for i in data: word_trend_num = len(data[i]) data[i] = data[i].apply(eval) data_temp = [] for k in range(word_trend_num): data_temp.append([]) for i in data: for j in range(word_trend_num): data_temp[j].append(data[i][j]) data = data_temp fields = [ ['red', 'Rate '], ['green', 'Word trend '], ['purple', 'Word trend '], ['yellow', 'Word trend '], ] x = range(len(data[0])) y = [] for k in range(len(data) + 1): y.append([]) for d in data[0]: y[0].append(d[1]) index = 0 for d in data: for dd in d: y[index + 1].append(float('%.2f' % ((dd[2] - dd[3]) * 10))) index += 1 # 画图 以折线图表示结果 plt.figure('fig1') plt.title('word trend & rate') print(x) print(y) index = 0 for k in range(len(y)): if index == 0: plt.plot(x, y[k], color=fields[k][0], label=fields[k][1]) else: plt.plot(x, y[k], color=fields[k][0], label=fields[k][1] + str(index)) index += 1 plt.legend() # 显示图例 plt.xlabel('date') plt.ylabel('data') plt.tick_params(labelsize=6) plt.show()
result_list = [
    match_date, team_home, team_visiting, game_type, hand_score, win_rate,
    draw_rate, lost_rate, big_small_ball, home_score, visiting_score, result,
    big_small_ball_line, total_score, win_score, score_type
]
data2.append(result_list)
print('%s added successfully' % team_home)
frame_data2 = DataFrame(data2)
frame_data2.rename(columns={
    0: 'match_date',
    1: 'team_home',
    2: 'team_visiting',
    3: 'game_type',
    4: 'hand_score',
    5: 'win_rate',
    6: 'draw_rate',
    7: 'lost_rate',
    8: 'big_small_ball',
    9: 'home_score',
    10: 'visiting_score',
    11: 'result',
    12: 'big_small_ball_line',
    13: 'total_score',
    14: 'win_score',
    15: 'score_type'
}, inplace=True)
frame_data3 = frame_data2.dropna(axis=0, how='any')
frame_data3.to_csv(path2, encoding='utf-8', header=None)
def _stack_multi_columns(frame, level=-1, dropna=True): this = frame.copy() # this makes life much simpler if level != frame.columns.nlevels - 1: # roll levels to put selected level at end roll_columns = this.columns for i in range(level, frame.columns.nlevels - 1): roll_columns = roll_columns.swaplevel(i, i + 1) this.columns = roll_columns if not this.columns.is_lexsorted(): this = this.sortlevel(0, axis=1) # tuple list excluding level for grouping columns if len(frame.columns.levels) > 2: tuples = zip(*[ lev.values.take(lab) for lev, lab in zip(this.columns.levels[:-1], this.columns.labels[:-1]) ]) unique_groups = [key for key, _ in itertools.groupby(tuples)] new_names = this.columns.names[:-1] new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) else: new_columns = unique_groups = this.columns.levels[0] # time to ravel the values new_data = {} level_vals = this.columns.levels[-1] levsize = len(level_vals) for key in unique_groups: loc = this.columns.get_loc(key) # can make more efficient? if loc.stop - loc.start != levsize: chunk = this.ix[:, this.columns[loc]] chunk.columns = level_vals.take(chunk.columns.labels[-1]) value_slice = chunk.reindex(columns=level_vals).values else: if frame._is_mixed_type: value_slice = this.ix[:, this.columns[loc]].values else: value_slice = this.values[:, loc] new_data[key] = value_slice.ravel() N = len(this) if isinstance(this.index, MultiIndex): new_levels = list(this.index.levels) new_names = list(this.index.names) new_labels = [lab.repeat(levsize) for lab in this.index.labels] else: new_levels = [this.index] new_labels = [np.arange(N).repeat(levsize)] new_names = [this.index.name] # something better? new_levels.append(frame.columns.levels[level]) new_labels.append(np.tile(np.arange(levsize), N)) new_names.append(frame.columns.names[level]) new_index = MultiIndex(levels=new_levels, labels=new_labels, names=new_names) result = DataFrame(new_data, index=new_index, columns=new_columns) # more efficient way to go about this? can do the whole masking biz but # will only save a small amount of time... if dropna: result = result.dropna(axis=0, how='all') return result
class MainApp(QMainWindow, ui): def __init__(self): super().__init__() QMainWindow.__init__(self) self.setupUi(self) self.HandleButtons() self.InitUI() #Store dataset to this self.data_train = DataFrame() self.data_test = DataFrame() self.columnsRemove = [] self.data_cleaned = DataFrame() self.train = True def InitUI(self): self.tabWidget.tabBar().setVisible(False) #Disabling remove columns before loading dataset for training self.listWidget_data_train.setEnabled(False) style = open('./themes/default.css', 'r') style = style.read() self.setStyleSheet(style) def HandleButtons(self): self.button_data_train.clicked.connect(self.HandleTrainBrowse) self.button_data_test.clicked.connect(self.HandleRunBrowse) self.button_drop.clicked.connect(self.RemoveColumn) self.button_drop_2.clicked.connect(self.RemoveColumn) self.button_train.clicked.connect(self.TrainModel) self.button_run.clicked.connect(self.RunModel) self.pushButton.clicked.connect(self.Open_Create) self.pushButton_2.clicked.connect(self.Open_Run) self.pushButton_3.clicked.connect(self.Open_Summary) self.pushButton_4.clicked.connect(self.open_Settings) self.button_model.clicked.connect(self.HandleModelBrowse) self.button_summary.clicked.connect(self.Summary) self.button_darkblue.clicked.connect(self.Apply_DarkBlue_Style) self.button_darkorange.clicked.connect(self.Apply_DarkOrange_Style) self.button_dark.clicked.connect(self.Apply_QDark_Style) self.button_darkgray.clicked.connect(self.Apply_DarkGray_Style) def GetLocation(self, operation: str, filter: str, caption: str) -> str: ''' Get file location either save or open file ''' if operation == 'open': return QFileDialog.getOpenFileName(self, caption=caption, directory='.', filter=filter)[0].strip() elif operation == 'save': return QFileDialog.getSaveFileName(self, caption=caption, directory='.', filter=filter)[0].strip() def HandleTrainBrowse(self): ## enable browseing to our os , pick save location save_location: str = self.GetLocation(operation='open', caption="Open", filter="CSV Files(*.csv)") print(save_location) if (save_location != ''): self.lineEdit_data_train.setText(str(save_location)) #display columns in listWidget self.data_train = pd.read_csv(self.lineEdit_data_train.text()) cols = self.data_train.columns.values.tolist() print(cols) self.listWidget_data_train.addItems(cols) self.listWidget_data_train.setEnabled(True) self.button_drop.setEnabled(True) self.train = True def HandleModelBrowse(self): self.model_location = self.GetLocation(operation='open', caption="Open", filter="JobLib Files(*.joblib)") if (self.model_location != ''): self.lineEdit_model.setText(str(self.model_location)) def HandleRunBrowse(self): ## enable browseing to our os , pick save location data_location = self.GetLocation(operation='open', caption="Open", filter="CSV Files(*.csv)") if data_location != '': self.lineEdit_data_test.setText(str(data_location)) #display columns in listWidget self.data_test = pd.read_csv(self.lineEdit_data_test.text()) cols = self.data_test.columns.values.tolist() print(cols) self.listWidget_data_test.addItems(cols) self.listWidget_data_test.setEnabled(True) self.button_drop_2.setEnabled(True) self.train = False def RemoveColumn(self): if (self.train): items = self.listWidget_data_train.selectedItems() list = self.listWidget_data_train data = self.data_train else: items = self.listWidget_data_test.selectedItems() list = self.listWidget_data_test data = self.data_test if items is None: return reply = QMessageBox.question( self, "Drop", "Remove`{0}'?".format(' '.join(map(lambda 
item: item.text(), items))), QMessageBox.Yes | QMessageBox.No, QMessageBox.No) if reply == QMessageBox.Yes: for item in items: row = list.row(item) item = list.takeItem(row) self.columnsRemove.append(item.text()) del item #Delete from dataframe only in training self.data_cleaned = data.drop(columns=self.columnsRemove, inplace=self.train) def TrainModel(self): print(self.data_train.columns) self.listWidget_data_train.clear() self.columnsRemove.clear() save_location = self.GetLocation(operation='save', caption="Save as", filter="JobLib Files(*.joblib)") if save_location != '': print(save_location, 'model train start') #train model self.data_train.dropna(inplace=True) self.data_train.drop_duplicates(inplace=True) X = pd.get_dummies(self.data_train) kmeans = KMeans(init='k-means++', max_iter=300, n_init=10, random_state=4) scaler = MinMaxScaler() scaled_features = scaler.fit_transform(X) visualizer = KElbowVisualizer(kmeans, k=(4, 12), metric='silhouette', timings=False) visualizer.fit(X) if (not visualizer.elbow_value_): clusterValue = 3 else: clusterValue = visualizer.elbow_value_ kmeans = KMeans(max_iter=300, n_init=10, random_state=4, n_clusters=clusterValue) print(clusterValue) kmeans.fit(scaled_features) #save model dump(kmeans, save_location + '.joblib') print('model train done') def RunModel(self): print(self.data_cleaned.columns) self.listWidget_data_test.clear() self.model = load(self.model_location) self.columnsRemove.clear() self.data_cleaned.dropna(inplace=True) self.data_cleaned.drop_duplicates(inplace=True) X = pd.get_dummies(self.data_cleaned) scaler = MinMaxScaler() scaled_features = scaler.fit_transform(X) y_means = self.model.predict(scaled_features) self.data_cleaned['Cluster'] = y_means self.data_cleaned.to_csv('output.csv') def Summary(self): data_location = self.GetLocation('open', 'CSV Files(*.csv)', 'Open') if data_location != '': self.lineEdit_summary.setText(data_location) df = pd.read_csv(data_location) summary_df = df.describe() #Row count row = summary_df.shape[0] self.tableWidget.setRowCount(row) #Column count column = summary_df.shape[1] self.tableWidget.setColumnCount(column) self.tableWidget.setHorizontalHeaderLabels( summary_df.columns.values.tolist()) self.tableWidget.setVerticalHeaderLabels( summary_df.index.values.tolist()) print(row, column) for i in range(row): for j in range(column): self.tableWidget.setItem( i, j, QTableWidgetItem(str(summary_df.iloc[i, j]))) self.tableWidget.resizeColumnsToContents() self.tableWidget.resizeRowsToContents() self.tableWidget.setEnabled(True) ################################################ ###### UI CHanges Methods def Open_Create(self): self.tabWidget.setCurrentIndex(0) def Open_Run(self): self.tabWidget.setCurrentIndex(3) def Open_Summary(self): self.tabWidget.setCurrentIndex(2) def open_Settings(self): self.tabWidget.setCurrentIndex(1) ################################################ ###### App Themes #### def Apply_DarkOrange_Style(self): style = open('./themes/darkorange.css', 'r') style = style.read() self.setStyleSheet(style) def Apply_QDark_Style(self): style = open('themes/qdark.css', 'r') style = style.read() self.setStyleSheet(style) def Apply_DarkGray_Style(self): style = open('themes/qdarkgray.css', 'r') style = style.read() self.setStyleSheet(style) def Apply_DarkBlue_Style(self): style = open('./themes/darkblue.css', 'r') style = style.read() self.setStyleSheet(style)
print(count_violations, count_violations.sum(), np.array(result_list).shape[0])
criticality = 0
for i in range(len(count_violations)):
    criticality += count_violations[i] * (127 - i) / 127
print(criticality / count_violations.sum())
category = 0
for i in range(len(count_violations)):
    if count_violations[i] > 0:
        category += 1
print(category, category / 128)
sns.set_style("darkgrid")
data_result = DataFrame(result_list)
data_result.rename(columns={
    0: 'a',
    1: 'b',
    2: 'c',
    3: 'd',
    4: 'e',
    5: 'f',
    6: 'g'
}, inplace=True)  # note: the keys 0..6 are integer column labels, not strings
# dropna returns a new frame; assign it back (the original call discarded the result)
data_result = data_result.dropna(axis=0, how='any')
print(data_result.corr())
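# Aside (added example, not from the original code): DataFrame.dropna is not an
# in-place operation by default, so the result must be assigned back (as above)
# or the call must pass inplace=True.
import numpy as np
import pandas as pd

_demo = pd.DataFrame({"a": [1.0, np.nan], "b": [3.0, 4.0]})
_demo.dropna()           # no effect: the returned copy is discarded
_demo = _demo.dropna()   # the row containing NaN is actually removed
# equivalently: _demo.dropna(inplace=True)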
def make_train_csv(cls, cur=None, start_date=None, end_date=None, basic_path=None, output_file=None, time_step=3, word_count=20, stock_id_str=None, ranking_type='tfidf'): if cur == None or start_date == None or end_date == None or output_file == None or stock_id_str == None: return None if basic_path is None: basic_path = os.path.dirname(os.path.abspath(__file__)) if time_step < 0: time_step = 3 if word_count < 0: word_count = 20 if ranking_type not in ["tfidf", "textrank"]: ranking_type = "tfidf" output_path = os.path.join(basic_path, output_file) VTool.makeDirs(files=[output_path]) pd.DataFrame({"0":[], "1":[]}).to_csv(output_path, index=False) words = cls.getImportVocab(cur, count=20, ranking_type=ranking_type) word_count = len(words) for i in range(len(words)): words[i] = "'" + words[i] + "'" words_str = ",".join(words) del words cur.execute("SELECT count(*) as count FROM history WHERE stock_id in (%s) and date between '%s' and '%s' " % (stock_id_str, start_date, end_date)) count = cur.fetchall() count = count[0][0] stock_id_num = len(stock_id_str.split(",")) skip = 50 * stock_id_num slimit = 0 while slimit < count: cur.execute("SELECT stock_id, opening, closing, difference, percentage_difference, lowest, highest, volume, amount, date FROM history WHERE stock_id in (%s) and date between '%s' and '%s' order by date asc, stock_id asc limit %d,%d " % (stock_id_str, start_date, end_date, 0 if slimit-stock_id_num < 0 else slimit-stock_id_num, skip if slimit-stock_id_num < 0 else skip+stock_id_num)) slimit += skip history_tt = cur.fetchall() history_t = [] for h in history_tt: history_t.append([int(h[0]), float(h[1]), float(h[2]), float(h[3]), float(h[4]), float(h[5]), float(h[6]), float(h[7]), float(h[8]), str(h[9])]) del history_tt sdate = str(history_t[0][9]) edate = str(history_t[-1][9]) sdate = datetime.datetime.strptime(sdate,'%Y-%m-%d') sdate = (sdate - datetime.timedelta( days=(time_step+1) )).strftime('%Y-%m-%d') cur.execute("SELECT b.vocab_id, b.bindex, b.date FROM vocab v left join baidu_index b on v.id = b.vocab_id WHERE v.word in (%s) and b.date between '%s' and '%s' order by date, vocab_id asc" % (words_str, sdate, edate)) bindex = cur.fetchall() bindex_t = [] bindex_vec = 0 cur_date = None if len(bindex) > 0: cur_date = str(bindex[0][2]) bix = [] bix_item = [cur_date] if len(bindex) > 0: for bi in bindex: if str(bi[2]) != cur_date: cur_date = str(bi[2]) bix.append(bix_item) bix_item = [cur_date] bix_temp = json.loads(bi[1]) bix_item.append(bix_temp['all']['0']) bix.append(bix_item) del bindex bindex = {} for k in range(1,len(bix)): b_t = [] for kk in range(1,len(bix[k])): if int(bix[k][kk]) != 0 and int(bix[k-1][kk]) != 0: b_t.append(str(np.round(float(100 * (int(bix[k][kk]) / int(bix[k-1][kk]) - 1)), 2))) else: b_t.append(str(0.00)) bindex[bix[k][0]] = b_t del bix for i in range(len(history_t)): history_t[i] += bindex[history_t[i][9]] history_temp = [] for h in zip(*history_t): history_temp.append(h) history = {'stock_id':history_temp[0], 'opening':history_temp[1], 'closing':history_temp[2], 'difference':history_temp[3], 'percentage_difference':history_temp[4], 'lowest':history_temp[5], 'highest':history_temp[6], 'volume':history_temp[7], 'amount':history_temp[8], 'date':history_temp[9]} for i in range(10, 10+word_count): history["word%s" % (i-9)] = history_temp[i] del history_t, history_temp history = DataFrame(history) g_history = history.groupby(by = ['stock_id']) #0.01 -> 1 % 保留2位小数 history['rate'] = 100 * (g_history.shift(0)["closing"] / g_history.shift(1)["closing"] - 
1) history.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True) for i in history.index: history.loc[i, 'rate'] = str(np.round(float(history['rate'][i]), 2)) #将经过标准化的数据处理成训练集和测试集可接受的形式 def func_train_data(data_stock, time_step): if cls.groupby_skip == False: cls.groupby_skip = True return None print ("正在处理的股票代码:%06s"%data_stock.name) word_key_list = [] for i in range(1,word_count+1): word_key_list.append("word%s" % i) x = word_key_list + ["opening", "closing", "difference", "percentage_difference", "lowest", "highest", "volume", "amount"] #提取输入S列(对应train_x) data_temp_x = data_stock[x] #提取输出列(对应train_y) data_temp_y = data_stock[["rate", "date", "stock_id"]] data_res = [] for i in range(time_step - 1, len(data_temp_x.index) - 1): data_res.append( data_temp_x.iloc[i - time_step + 1: i + 1].values.reshape(1, time_step * (8+word_count)).tolist() + data_temp_y.iloc[i + 1].values.reshape(1,3).tolist() ) if len(data_res) != 0: pd.DataFrame(data_res).to_csv(output_path, index=False, header=False, mode="a") g_stock = history.groupby(by = ["stock_id"]) #清空接收路径下的文件,初始化列名 cls.groupby_skip = False g_stock.apply(func_train_data, time_step = time_step)
def getStockRateDistributed(cls, cur=None, stock_ids_str=None, start_date=None, end_date=None):
    if cur is None or stock_ids_str is None or start_date is None or end_date is None:
        return None
    cur.execute(
        "SELECT stock_id, closing, date FROM history WHERE stock_id in (%s) and date between '%s' and '%s' order by stock_id, date "
        % (stock_ids_str, start_date, end_date))
    history_t = cur.fetchall()
    history_temp = []
    for h in zip(*history_t):
        history_temp.append(h)
    history = {
        'stock_id': history_temp[0],
        'closing': history_temp[1],
        'date': history_temp[2]
    }
    del history_t, history_temp
    history = DataFrame(history)
    g_history = history.groupby(by=['stock_id'])
    # 0.01 -> 1 %, keep two decimal places
    history['rate'] = 100 * (
        g_history.shift(0)["closing"] / g_history.shift(1)["closing"] - 1)
    history.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)
    x_1 = []
    x_2 = []
    for i in range(-10, 1):
        x_1.append(i)
    for i in range(0, 11):
        x_2.append(i)
    y_stock = {}

    def func_train_data(date_stock):
        if cls.groupby_skip == False:
            cls.groupby_skip = True
            return None
        y = []
        x_1_length = len(x_1)
        x_2_length = len(x_2)
        for i in range(x_1_length + x_2_length):
            y.append(0)
        for k in date_stock['stock_id'].keys():
            rate = date_stock['rate'][k]
            for k in range(len(x_1)):
                if x_1[k] == 0 and rate < x_1[k]:
                    y[k] += 1
                elif rate <= x_1[k]:
                    y[k] += 1
            for k in range(len(x_2)):
                if rate >= x_2[k]:
                    y[x_1_length + k] += 1
        y_stock[date_stock.name] = y
        print("Processing stock ID: %s" % date_stock.name)

    g_stock = history.groupby(by=["stock_id"])
    cls.groupby_skip = False
    g_stock.apply(func_train_data)

    x = x_1 + x_2
    fields = [['red', 'stock id '], ['blue', 'stock id '], ['yellow', 'stock id ']]
    for k in y_stock:
        all_num = y_stock[k][10] + y_stock[k][11]
        for i in range(len(x)):
            y_stock[k][i] = float('%.2f' % (float(y_stock[k][i]) / all_num))
    # plot the results as a line chart
    plt.figure()
    i = 0
    print(x)
    for k in y_stock:
        print(y_stock[k])
        plt.plot(x, y_stock[k], color=fields[i][0], label='%s %s' % (fields[i][1], k))
        i += 1
        for a, b in zip(x, y_stock[k]):
            plt.text(a, b, b, ha='center', va='bottom', fontsize=7)
    plt.legend()  # show the legend
    plt.title('rate distributed')
    plt.xlabel('rate division')
    plt.ylabel('percent')
    plt.show()
def strat_maLong_maShort(
    df=readYahoo("SPY"),
    maLongDays=10,
    maShortDays=3,
    closeCol="Close",
    highCol="High",
    lowCol="Low",
    openCol="Open",
    signOfTrade=1,
    printit=True,
    block=False,
):
    """
    execute strategy which enters and exits based on Moving Average crossovers
    Example:
    from pystrats.state_strats import strat_maLong_maShort as ss
    dfretfinal = ss()  # strat_maLong_maShort()
    print(dfretfinal)
    print(dfretfinal['ret'].mean())
    """
    close = np.array(df[closeCol])
    high = np.array(df[highCol])
    low = np.array(df[lowCol])
    open = np.array(df[openCol])
    date = np.array(df["Date"])

    ma10 = rolling_mean(close, maLongDays)
    ma9 = rolling_mean(close, maLongDays - 1)
    ma3 = rolling_mean(close, maShortDays)
    ma2 = rolling_mean(close, maShortDays - 1)

    n = len(df)
    nl = n - 1

    # pMa10 = dsInsert(ma10[0:nl],0,None)
    # pMa9 = dsInsert(ma9[0:nl],0,None)
    # pMa3 = dsInsert(ma3[0:nl],0,None)
    # pMa2 = dsInsert(ma2[0:nl],0,None)
    pMa10 = np.insert(ma10[0:nl], 0, None)
    pMa9 = np.insert(ma9[0:nl], 0, None)
    pMa3 = np.insert(ma3[0:nl], 0, None)
    pMa2 = np.insert(ma2[0:nl], 0, None)

    pClose = np.insert(close[0:nl], 0, None)
    pHigh = np.insert(high[0:nl], 0, None)
    pLow = np.insert(low[0:nl], 0, None)

    # initialize state vector
    state = np.array([1] * n)

    # loop
    start_i = maLongDays + 1
    for i in range(start_i, n):
        if (pClose[i] < pMa10[i]) & (state[i - 1] == 1) & (high[i] > pMa9[i]):
            state[i] = 2
        elif (state[i - 1] == 2) & (low[i] > pMa2[i]):
            state[i] = 2
        elif (state[i - 1] == 2) & (low[i] <= pMa2[i]):
            state[i] = 1

    pState = np.insert(state[0:nl], 0, 1)

    # create entry conditions
    # 1. initial entry (state 1 to state 2)
    e1_2 = np.array((pState == 1) & (state == 2))
    e2_2 = np.array((pState == 2) & (state == 2))
    e2_1 = np.array((pState == 2) & (state == 1))

    dfret = DataFrame([date, pHigh, pLow, pClose, pMa10, pMa9, pMa3, pMa2]).T
    dfret.columns = ["Date", "pHigh", "pLow", "pClose", "pMa10", "pMa9", "pMa3", "pMa2"]

    # create daily entry prices
    dailyEntryPrices = np.array([0] * n)
    # default entry
    dailyEntryPrices = asb(dailyEntryPrices, pMa9, e1_2)
    useCloseOnEntry = e1_2 & (low > pMa9)
    dailyEntryPrices = asb(dailyEntryPrices, close, useCloseOnEntry)
    dailyEntryPrices = asb(dailyEntryPrices, pClose, e2_2)
    dailyEntryPrices = asb(dailyEntryPrices, pClose, e2_1)
    dfret["entry"] = dailyEntryPrices

    # create DAILY settle prices, which are either 0 or the Close
    # dfret$Close <- close
    dailySettlePrices = np.array([0] * n)
    dailySettlePrices = asb(dailySettlePrices, close, e1_2)  # <- close[w1_2]
    dailySettlePrices = asb(dailySettlePrices, close, e2_2)  # dailySettlePrices[w2_2] <- close[w2_2]
    dailySettlePrices = asb(dailySettlePrices, pMa2, e2_1)  # dailySettlePrices[w2_1] <- pMa2[w2_1]

    # adjust for situations where the high is below the pMa2, so you get out at the close
    useCloseOnExit = e2_1 & (high < pMa2)
    dailySettlePrices = asb(
        dailySettlePrices, close, useCloseOnExit
    )  # dailySettlePrices[useCloseOnExit] <- close[useCloseOnExit]
    dfret["exit"] = dailySettlePrices

    dfret["ret"] = dfret["exit"] / dfret["entry"] - 1
    # fillna returns a copy, so assign the result back
    dfret["ret"] = dfret["ret"].fillna(0)
    dfretfinal = dfret.dropna(axis=0)  # dfretfinal <- dfret[-badrows(dfret),]

    if printit:
        retDf = DataFrame({"Date": dfretfinal["Date"], "ret": dfretfinal["ret"]})
        returnsPerformance(retDf, block=block)
    return dfretfinal
def makeBindexOriginCsv(cls, cur=None, words=None, start_date=None, end_date=None, day_num=1, basic_path=None, output_file=None, stock_id=None): #初始化源文件路径和存储文件路径 if cur is None or words is None or start_date is None or end_date is None or stock_id is None: return None if basic_path is None: basic_path = os.path.dirname(os.path.abspath(__file__)) if output_file is None: output_file = "bindex_data.csv" output_path = os.path.join(basic_path, output_file) VTool.makeDirs(files=[output_path]) #清空接收路径下的文件,初始化列名 pd.DataFrame({ "0": [], "1": [] }).to_csv(output_path, index=False, encoding="utf-8") start_date = datetime.datetime.strptime(start_date, '%Y-%m-%d') end_date = datetime.datetime.strptime(end_date, '%Y-%m-%d') for i in range(len(words)): words[i] = "'" + words[i] + "'" words_str = ",".join(words) cur.execute( "SELECT count(*) as count FROM history WHERE stock_id = '%s' and date between '%s' and '%s' " % (stock_id, start_date, end_date)) count = cur.fetchall() count = count[0][0] deviation = 2 skip = 100 slimit = 0 while slimit < count: cur.execute( "SELECT stock_id,closing,date FROM history WHERE stock_id = '%s' and date between '%s' and '%s' order by date, stock_id asc limit %d,%d " % (stock_id, start_date, end_date, 0 if slimit - deviation - day_num < 0 else slimit - deviation - day_num, skip if slimit - deviation - day_num < 0 else skip + deviation + day_num)) history_t = cur.fetchall() sdate = str(history_t[0][2]) edate = str(history_t[-1][2]) history_temp = [] for h in zip(*history_t): history_temp.append(h) history = { 'stock_id': history_temp[0], 'closing': history_temp[1], 'date': history_temp[2] } del history_t, history_temp history = DataFrame(history) g_history = history.groupby(by=['stock_id']) #0.01 -> 1 % 保留2位小数 history['rate'] = 100 * (g_history.shift(0)["closing"] / g_history.shift(1)["closing"] - 1) history.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True) ''' ''' cur.execute( "SELECT b.vocab_id, b.bindex, b.date FROM vocab v left join baidu_index b on v.id = b.vocab_id WHERE v.word in (%s) and b.date between '%s' and '%s' order by date, vocab_id asc" % (words_str, sdate, edate)) bindex = cur.fetchall() news_date = {} for k in history['date'].keys(): if (k - deviation - day_num + 1) in history['date']: news_date[str(history['date'][k])] = [ str(history['date'][k - deviation - day_num + 1]), str(history['date'][k - deviation]) ] bindex_t = [] bindex_vec = 0 cur_date = None if len(bindex) > 0: cur_date = str(bindex[0][2]) bix = [] for bi in bindex: if str(bi[2]) != cur_date: bindex_t.append([bix, cur_date]) cur_date = str(bi[2]) bix = [] bix_temp = json.loads(bi[1]) bix_temp = sorted(bix_temp.items(), key=lambda v: v[0]) for k, b in bix_temp: bix_list = sorted(b.items(), key=lambda v: v[0]) for kk, bb in bix_list: bix.append(bb) if bindex_vec == 0: bindex_vec = len(bix) bindex_t.append([bix, cur_date]) del bindex bindex_by_date = {} for k in range(1, len(bindex_t)): b_t = [] for kk in range(len(bindex_t[k][0])): if int(bindex_t[k][0][kk]) != 0 and int( bindex_t[k - 1][0][kk]) != 0: b_t.append( str( np.round( float(100 * (int(bindex_t[k][0][kk]) / int(bindex_t[k - 1][0][kk]) - 1)), 2))) else: b_t.append(str(0.00)) bindex_by_date[bindex_t[k][1]] = b_t del bindex_t def func_train_data(date_stock): if cls.groupby_skip == False: cls.groupby_skip = True return None date = str(date_stock.name) if date not in news_date: return sdate = datetime.datetime.strptime(news_date[date][0], '%Y-%m-%d') edate = datetime.datetime.strptime(news_date[date][1], '%Y-%m-%d') bindexs 
= [] while sdate <= edate: cur_date = sdate.strftime('%Y-%m-%d') sdate += datetime.timedelta(days=1) if cur_date not in bindex_by_date: print("%s error" % cur_date) exit() else: bindexs += bindex_by_date[cur_date] data = [] for k in date_stock['stock_id'].keys(): data.append([(np.array(bindexs).reshape( int(len(bindexs) / bindex_vec), bindex_vec)).tolist(), [ str(np.round(float(history['rate'][k]), 2)), str(date_stock['date'][k]), str(date_stock['stock_id'][k]) ]]) print("正在处理的日期:%s" % date_stock.name) pd.DataFrame(data).to_csv(output_path, index=False, header=False, mode="a", encoding="utf-8") g_stock = history.groupby(by=["date"]) cls.groupby_skip = False g_stock.apply(func_train_data) slimit += skip
def on_data(context: Context):
    if datetime.datetime.strftime(context.now, '%Y-%m-%d') not in context.month_begin:
        return
    # fetch CSI 300 index data
    price = get_reg_kdata(reg_idx=context.reg_kdata[0], length=1, fill_up=True, df=True)
    index = get_reg_kdata(reg_idx=context.reg_kdata[0], target_indices=300,
                          length=context.long + context.Len - 1, fill_up=False, df=True)
    factor = get_reg_factor(reg_idx=context.reg_factor[0], target_indices=(), length=5, df=True)
    if price['close'].isna().any():
        return
    """
    Compute the long- and short-term volatility of the CSI 300 index, using the
    long-term volatility as a threshold: if the short-term volatility breaks above it,
    reduce the stock-pool positions to 50%.
    """
    index['ret'] = index.groupby('target_idx')['close'].apply(
        lambda x: (x - x.shift()) / x.shift())
    index = index.fillna(0)  # replace NaN with 0
    ret = index.ret.values.astype(float)
    StdDev = talib.STDDEV(ret, timeperiod=context.Len, nbdev=1)
    StdDev = DataFrame({"a": StdDev})
    StdDev = StdDev.dropna()
    std = StdDev['a'].tolist()
    std_short = np.mean(std[-14:])
    bound = np.mean(std)

    # factors are registered at daily frequency by default
    factor = factor.dropna(subset=['date'])  # drop invalid dates
    factor['code'] = factor['target_idx'].apply(
        lambda x: context.target_list[x])  # map integer target indices (0, 1, 2, ...) to stock codes
    factor['month'] = factor['date'].apply(lambda x: int(
        str(x)[0:4] + str(x)[5:7]))  # add a month column (e.g. 201701): keep only the month, not day/time
    factor_name = factor['factor'].drop_duplicates().tolist()  # collect the factor names as a list
    # group factor by ['target_idx', 'month', 'factor'] and take the last row of each group,
    # i.e. each stock's month-end value for every factor
    factor_month = factor.groupby(
        ['target_idx', 'month',
         'factor']).apply(lambda x: x.iloc[-1])[['date', 'value']].reset_index()
    # pivot all factor names into new columns
    factor_month1 = factor_month.groupby(['target_idx', 'month']).apply(deal).reset_index()
    """
    Take the most recent month (the current time).
    """
    test = factor_month1.groupby('target_idx').apply(lambda x: x.iloc[-1])
    scaler = StandardScaler()  # standardization
    X_test = test[factor_name]
    X_test = X_test.fillna(0).values
    # X_test=scaler.fit_transform(X_test)  # standardize the factors
    X_test = scaler.fit_transform(X_test)  # standardize the factors
    # prediction
    model = pickle.load(open("XGboost_ret0.06_5factor.pickle.dat", "rb"))
    y_pred = model.predict(X_test)
    y_pred1 = pd.DataFrame(y_pred, columns=['label'])
    idx_list = list(y_pred1[y_pred1['label'] == 1].index)
    print(idx_list)
    positions = context.account().positions
    if len(idx_list) == 0:
        # no stock is in the target pool, so sell everything
        for target_idx in positions.loc[positions['volume_long'] > 0,
                                        'target_idx'].astype(int):
            volume = positions['volume_long'].iloc[target_idx]
            order_volume(account_idx=0, target_idx=target_idx, volume=int(volume),
                         side=2, position_effect=2, order_type=2, price=0)
    else:
        positions = context.account().positions
        # close positions that are not in the target pool
        for target_idx in positions.target_idx.astype(int):
            if target_idx not in idx_list:
                if positions['volume_long'].iloc[target_idx] > 0:
                    volume = positions['volume_long'].iloc[target_idx]
                    order_volume(account_idx=0, target_idx=target_idx, volume=int(volume),
                                 side=2, position_effect=2, order_type=2, price=0)
                    print("closed position not in target pool " + str(target_idx))
        # risk control based on volatility
        if std_short > bound:
            positions = context.account().positions
            for target_idx in positions.loc[positions['volume_long'] > 0,
                                            'target_idx'].astype(int):
                volume = positions['volume_long'].iloc[target_idx]
                order_volume(account_idx=0, target_idx=target_idx, volume=int(volume * 0.5),
                             side=2, position_effect=2, order_type=2, price=0)
                print("risk control " + str(target_idx))
            # get the stock weights
            positions = context.account().positions
            percent_b = context.ratio / len(idx_list)
            # print(percent_b)
            # buy the stocks in the target pool
            for target_idx in idx_list:
                order_target_percent(account_idx=0, target_idx=target_idx,
                                     target_percent=percent_b * 0.5, side=1, order_type=2)
            print(positions.loc[positions['volume_long'] > 0, 'code'].tolist())
        else:
            # get the stock weights
            positions = context.account().positions
            percent_b = context.ratio / len(idx_list)
            # print(percent_b)
            # buy the stocks in the target pool
            for target_idx in idx_list:
                order_target_percent(account_idx=0, target_idx=target_idx,
                                     target_percent=percent_b * 0.5, side=1, order_type=2)
            print(positions.loc[positions['volume_long'] > 0, 'code'].tolist())
def makeTextOriginCsv(cls, cur=None, start_date=None, end_date=None, day_num=1, basic_path=None, input_file=None, output_file=None, stock_id=None, rewrite=True): #初始化源文件路径和存储文件路径 if cur is None or start_date is None or end_date is None or input_file is None or output_file is None or stock_id is None: return None if basic_path is None: basic_path = os.path.dirname(os.path.abspath(__file__)) input_path = os.path.join(basic_path, input_file) output_path = os.path.join(basic_path, output_file) VTool.makeDirs(files=[output_path]) ''' ''' cur.execute( "SELECT count(*) as count FROM history WHERE stock_id = '%s' and date between '%s' and '%s' " % (stock_id, start_date, end_date)) count = cur.fetchall() count = count[0][0] if rewrite == True: pd.DataFrame({"0": [], "1": []}).to_csv(output_path, index=False) deviation = 2 skip = 50 slimit = 0 while slimit < count: cur.execute( "SELECT stock_id,closing,date FROM history WHERE stock_id = '%s' and date between '%s' and '%s' order by date asc, stock_id asc limit %d,%d " % (stock_id, start_date, end_date, 0 if slimit - deviation - day_num < 0 else slimit - deviation - day_num, skip if slimit - deviation - day_num < 0 else skip + deviation + day_num)) history_t = cur.fetchall() sdate = str(history_t[0][2]) edate = str(history_t[-1][2]) history_temp = [] for h in zip(*history_t): history_temp.append(h) history = { 'stock_id': history_temp[0], 'closing': history_temp[1], 'date': history_temp[2] } del history_t, history_temp history = DataFrame(history) g_history = history.groupby(by=['stock_id']) #0.01 -> 1 % 保留2位小数 history['rate'] = 100 * (g_history.shift(0)["closing"] / g_history.shift(1)["closing"] - 1) history.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True) ''' ''' cur.execute( "SELECT GROUP_CONCAT(id SEPARATOR ','), time FROM news WHERE time between '%s' and '%s' GROUP BY time order by time " % (sdate, edate)) news_temp = cur.fetchall() news_date = {} for k in history['date'].keys(): if (k - deviation - day_num + 1) in history['date']: news_date[str(history['date'][k])] = [ str(history['date'][k - deviation - day_num + 1]), str(history['date'][k - deviation]) ] news_by_date = {} news_by_id = {} for n in news_temp: news_by_date[str(n[1])] = n[0].split(",") for nid in news_by_date[str(n[1])]: news_by_id[nid] = '' del news_temp nid_len = len(news_by_id) reader = pd.read_csv(input_path, chunksize=1000) for sentences in reader: for k in sentences['1'].keys(): nid = str(sentences['0'][k]) if nid in news_by_id and news_by_id[nid] == '': news_by_id[nid] = str(sentences['1'][k]).split(" ") nid_len -= 1 if nid_len <= 0: break reader.close() del reader, sentences def func_train_data(date_stock): if cls.groupby_skip == False: cls.groupby_skip = True return None date = str(date_stock.name) if date not in news_date: return sdate = datetime.datetime.strptime(news_date[date][0], '%Y-%m-%d') edate = datetime.datetime.strptime(news_date[date][1], '%Y-%m-%d') words = [] while sdate <= edate: cur_date = sdate.strftime('%Y-%m-%d') sdate += datetime.timedelta(days=1) if cur_date not in news_by_date: print("%s error" % cur_date) return None for i in news_by_date[cur_date]: words += news_by_id[i] data = [] for k in date_stock['stock_id'].keys(): data.append([[" ".join(words)], [ str(np.round(float(history['rate'][k]), 2)), str(date_stock['date'][k]), str(date_stock['stock_id'][k]) ]]) print("正在处理的日期:%s" % date_stock.name) pd.DataFrame(data).to_csv(output_path, index=False, header=False, mode="a", encoding="utf-8") g_stock = history.groupby(by=["date"]) 
cls.groupby_skip = False g_stock.apply(func_train_data) slimit += skip
def _stack_multi_columns(frame, level_num=-1, dropna=True): def _convert_level_number(level_num, columns): """ Logic for converting the level number to something we can safely pass to swaplevel: We generally want to convert the level number into a level name, except when columns do not have names, in which case we must leave as a level number """ if level_num in columns.names: return columns.names[level_num] else: if columns.names[level_num] is None: return level_num else: return columns.names[level_num] this = frame.copy() # this makes life much simpler if level_num != frame.columns.nlevels - 1: # roll levels to put selected level at end roll_columns = this.columns for i in range(level_num, frame.columns.nlevels - 1): # Need to check if the ints conflict with level names lev1 = _convert_level_number(i, roll_columns) lev2 = _convert_level_number(i + 1, roll_columns) roll_columns = roll_columns.swaplevel(lev1, lev2) this.columns = roll_columns if not this.columns.is_lexsorted(): # Workaround the edge case where 0 is one of the column names, # which interferes with trying to sort based on the first # level level_to_sort = _convert_level_number(0, this.columns) this = this.sortlevel(level_to_sort, axis=1) # tuple list excluding level for grouping columns if len(frame.columns.levels) > 2: tuples = list(zip(*[lev.take(lab) for lev, lab in zip(this.columns.levels[:-1], this.columns.labels[:-1])])) unique_groups = [key for key, _ in itertools.groupby(tuples)] new_names = this.columns.names[:-1] new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) else: new_columns = unique_groups = this.columns.levels[0] # time to ravel the values new_data = {} level_vals = this.columns.levels[-1] level_labels = sorted(set(this.columns.labels[-1])) level_vals_used = level_vals[level_labels] levsize = len(level_labels) drop_cols = [] for key in unique_groups: loc = this.columns.get_loc(key) slice_len = loc.stop - loc.start # can make more efficient? if slice_len == 0: drop_cols.append(key) continue elif slice_len != levsize: chunk = this.ix[:, this.columns[loc]] chunk.columns = level_vals.take(chunk.columns.labels[-1]) value_slice = chunk.reindex(columns=level_vals_used).values else: if frame._is_mixed_type: value_slice = this.ix[:, this.columns[loc]].values else: value_slice = this.values[:, loc] new_data[key] = value_slice.ravel() if len(drop_cols) > 0: new_columns = new_columns.difference(drop_cols) N = len(this) if isinstance(this.index, MultiIndex): new_levels = list(this.index.levels) new_names = list(this.index.names) new_labels = [lab.repeat(levsize) for lab in this.index.labels] else: new_levels = [this.index] new_labels = [np.arange(N).repeat(levsize)] new_names = [this.index.name] # something better? new_levels.append(frame.columns.levels[level_num]) new_labels.append(np.tile(level_labels, N)) new_names.append(frame.columns.names[level_num]) new_index = MultiIndex(levels=new_levels, labels=new_labels, names=new_names, verify_integrity=False) result = DataFrame(new_data, index=new_index, columns=new_columns) # more efficient way to go about this? can do the whole masking biz but # will only save a small amount of time... if dropna: result = result.dropna(axis=0, how="all") return result
def makeTrendStockOriginCsv(cls, cur=None, start_date=None, end_date=None, day_num=3, basic_path=None, stock_id=None, word_trend_file=None, news_file=None, output_file=None): #初始化源文件路径和存储文件路径 if cur is None or start_date is None or end_date is None or stock_id is None or output_file is None or word_trend_file is None or news_file is None: return None if basic_path is None: basic_path = os.path.dirname(os.path.abspath(__file__)) word_trend_path = os.path.join(basic_path, word_trend_file) news_path = os.path.join(basic_path, news_file) output_path = os.path.join(basic_path, output_file) VTool.makeDirs(files=[output_path]) #清空接收路径下的文件,初始化列名 pd.DataFrame({ "0": [], "1": [] }).to_csv(output_path, index=False, encoding="utf-8") word_trend = {} word_trend_temp = pd.read_csv(word_trend_path) for k in word_trend_temp["0"].keys(): word_trend[word_trend_temp["0"][k]] = [ word_trend_temp["1"][k], word_trend_temp["2"][k] ] p_up = word_trend['total_words'][0] / (word_trend['total_words'][0] + word_trend['total_words'][1]) p_down = word_trend['total_words'][1] / (word_trend['total_words'][0] + word_trend['total_words'][1]) cur.execute( "SELECT count(*) as count FROM history WHERE stock_id = '%s' and date between '%s' and '%s' " % (stock_id, start_date, end_date)) count = cur.fetchall() count = count[0][0] deviation = 2 skip = 100 slimit = 0 while slimit < count: cur.execute( "SELECT stock_id, opening, closing, difference, percentage_difference, lowest, highest, volume, amount, date FROM history WHERE stock_id = '%s' and date between '%s' and '%s' order by date asc, stock_id asc limit %d,%d " % (stock_id, start_date, end_date, 0 if slimit - day_num - deviation < 0 else slimit - day_num - deviation, skip if slimit - day_num - deviation < 0 else skip + day_num + deviation)) history_tt = cur.fetchall() history_t = [] for h in history_tt: history_t.append([ int(h[0]), float(h[1]), float(h[2]), float(h[3]), float(h[4]), float(h[5]), float(h[6]), float(h[7]), float(h[8]), str(h[9]) ]) del history_tt history_temp = [] for h in zip(*history_t): history_temp.append(h) history = { 'stock_id': history_temp[0], 'opening': history_temp[1], 'closing': history_temp[2], 'difference': history_temp[3], 'percentage_difference': history_temp[4], 'lowest': history_temp[5], 'highest': history_temp[6], 'volume': history_temp[7], 'amount': history_temp[8], 'date': history_temp[9] } del history_t, history_temp history = DataFrame(history) g_history = history.groupby(by=['stock_id']) #0.01 -> 1 % 保留2位小数 history['rate'] = 100 * (g_history.shift(0)["closing"] / g_history.shift(1)["closing"] - 1) history.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True) ''' ''' sdate = str(history['date'][history['date'].keys()[0]]) edate = str(history['date'][history['date'].keys()[-1]]) cur.execute( "SELECT GROUP_CONCAT(id SEPARATOR ','), time FROM news WHERE time between '%s' and '%s' group by time" % (sdate, edate)) news_temp = cur.fetchall() news_by_date = {} news_by_id = {} for n in news_temp: news_by_date[str(n[1])] = n[0].split(",") for nid in news_by_date[str(n[1])]: news_by_id[nid] = None del news_temp nid_len = len(news_by_id) reader = pd.read_csv(news_path, chunksize=1000) for sentences in reader: if nid_len > 0: for k in sentences['1'].keys(): nid = str(sentences['0'][k]) if nid in news_by_id and news_by_id[nid] == None: news_by_id[nid] = str(sentences['1'][k]).split(" ") wp_up = p_up wp_down = p_down for w in news_by_id[nid]: if w not in word_trend: wp_up *= (1 / word_trend['total_words'][0]) wp_down *= (1 / 
word_trend['total_words'][1]) else: if word_trend[w][0] > 0: wp_up *= word_trend[w][0] else: wp_up *= (1 / word_trend['total_words'][0]) if word_trend[w][1] > 0: wp_down *= word_trend[w][1] else: wp_down *= ( 1 / word_trend['total_words'][1]) while True: if wp_up < 1 and wp_down < 1: wp_up *= 10 wp_down *= 10 else: break news_by_id[nid] = [ wp_up / (wp_up + wp_down), -1 * wp_down / (wp_up + wp_down) ] nid_len -= 1 if nid_len <= 0: break else: break reader.close() del reader, sentences for d in news_by_date: sumn = [0, 0] for nid in news_by_date[d]: sumn[0] += news_by_id[nid][0] sumn[1] += news_by_id[nid][1] le = len(news_by_date[d]) if le > 0: sumn[0] /= le sumn[1] /= le news_by_date[d] = sumn print(d) history['news_pos_num'] = 0 history['news_neg_num'] = 0 for i in history.index: history.loc[i, 'rate'] = str( np.round(float(history['rate'][i]), 2)) if str(history['date'][i]) in news_by_date: history.loc[i, 'news_pos_num'] = str( np.round( float(news_by_date[str(history['date'][i])][0]), 2)) history.loc[i, 'news_neg_num'] = str( np.round( float(news_by_date[str(history['date'][i])][1]), 2)) else: history.loc[i, 'news_pos_num'] = "0" history.loc[i, 'news_neg_num'] = "0" #将经过标准化的数据处理成训练集和测试集可接受的形式 def func_train_data(data_stock): if cls.groupby_skip == False: cls.groupby_skip = True return None print("正在处理的股票代码:%06s" % data_stock.name) data_temp_x = data_stock[[ "opening", "closing", "difference", "percentage_difference", "lowest", "highest", "volume", "amount", "news_pos_num", "news_neg_num" ]] data_temp_y = data_stock[["rate", "date", "stock_id"]] data_res = [] for i in range(day_num - 1, len(data_temp_x.index) - deviation): data_res.append([ data_temp_x.iloc[i - day_num + 1:i + 1].values.reshape( day_num, 10).tolist() ] + data_temp_y.iloc[i + deviation].values.reshape( 1, 3).tolist()) if len(data_res) != 0: pd.DataFrame(data_res).to_csv(output_path, index=False, header=False, mode="a") g_stock_num = history.groupby(by=["stock_id"]) cls.groupby_skip = False g_stock_num.apply(func_train_data) slimit += skip
def _stack_multi_columns(frame, level_num=-1, dropna=True): def _convert_level_number(level_num, columns): """ Logic for converting the level number to something we can safely pass to swaplevel: We generally want to convert the level number into a level name, except when columns do not have names, in which case we must leave as a level number """ if level_num in columns.names: return columns.names[level_num] else: if columns.names[level_num] is None: return level_num else: return columns.names[level_num] this = frame.copy() # this makes life much simpler if level_num != frame.columns.nlevels - 1: # roll levels to put selected level at end roll_columns = this.columns for i in range(level_num, frame.columns.nlevels - 1): # Need to check if the ints conflict with level names lev1 = _convert_level_number(i, roll_columns) lev2 = _convert_level_number(i + 1, roll_columns) roll_columns = roll_columns.swaplevel(lev1, lev2) this.columns = roll_columns if not this.columns.is_lexsorted(): # Workaround the edge case where 0 is one of the column names, # which interferes with trying to sort based on the first # level level_to_sort = _convert_level_number(0, this.columns) this = this.sort_index(level=level_to_sort, axis=1) # tuple list excluding level for grouping columns if len(frame.columns.levels) > 2: tuples = list( zip(*[ lev.take(lab) for lev, lab in zip(this.columns.levels[:-1], this.columns.labels[:-1]) ])) unique_groups = [key for key, _ in itertools.groupby(tuples)] new_names = this.columns.names[:-1] new_columns = MultiIndex.from_tuples(unique_groups, names=new_names) else: new_columns = unique_groups = this.columns.levels[0] # time to ravel the values new_data = {} level_vals = this.columns.levels[-1] level_labels = sorted(set(this.columns.labels[-1])) level_vals_used = level_vals[level_labels] levsize = len(level_labels) drop_cols = [] for key in unique_groups: loc = this.columns.get_loc(key) # can make more efficient? # we almost always return a slice # but if unsorted can get a boolean # indexer if not isinstance(loc, slice): slice_len = len(loc) else: slice_len = loc.stop - loc.start if slice_len == 0: drop_cols.append(key) continue elif slice_len != levsize: chunk = this.loc[:, this.columns[loc]] chunk.columns = level_vals.take(chunk.columns.labels[-1]) value_slice = chunk.reindex(columns=level_vals_used).values else: if frame._is_mixed_type: value_slice = this.loc[:, this.columns[loc]].values else: value_slice = this.values[:, loc] new_data[key] = value_slice.ravel() if len(drop_cols) > 0: new_columns = new_columns.difference(drop_cols) N = len(this) if isinstance(this.index, MultiIndex): new_levels = list(this.index.levels) new_names = list(this.index.names) new_labels = [lab.repeat(levsize) for lab in this.index.labels] else: new_levels = [this.index] new_labels = [np.arange(N).repeat(levsize)] new_names = [this.index.name] # something better? new_levels.append(level_vals) new_labels.append(np.tile(level_labels, N)) new_names.append(frame.columns.names[level_num]) new_index = MultiIndex(levels=new_levels, labels=new_labels, names=new_names, verify_integrity=False) result = DataFrame(new_data, index=new_index, columns=new_columns) # more efficient way to go about this? can do the whole masking biz but # will only save a small amount of time... if dropna: result = result.dropna(axis=0, how='all') return result
def getHighRateDate(self,
                    cur=None,
                    stock_id_str=None,
                    start_date=None,
                    end_date=None,
                    rate=2):
    if cur is None or stock_id_str is None or start_date is None or end_date is None or rate < 0:
        return None

    cur.execute(
        "SELECT stock_id, closing, date FROM history WHERE stock_id in (%s) and date between '%s' and '%s' order by stock_id, date "
        % (stock_id_str, start_date, end_date))
    history_t = cur.fetchall()
    history_temp = []
    for h in zip(*history_t):
        history_temp.append(h)
    history = {
        'stock_id': history_temp[0],
        'closing': history_temp[1],
        'date': history_temp[2]
    }
    del history_t, history_temp

    history = DataFrame(history)
    g_history = history.groupby(by=['stock_id'])
    # 0.01 -> 1 %, keep two decimal places
    history['rate'] = 100 * (g_history.shift(0)["closing"] /
                             g_history.shift(1)["closing"] - 1)
    history.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)

    left_shift = 2
    test_part_array = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    lens = len(history['date'])
    high_rate_dates = []
    for i in range(len(test_part_array) - 1):
        index = 0
        start = int(lens * test_part_array[i])
        end = int(lens * test_part_array[i + 1])
        for k in history['rate'].keys():
            if index == start:
                start = str(history['date'][k])
            if index == end:
                end = str(history['date'][k])
                break
            index += 1

        stocks = {}
        news_date = {}
        for k in history['rate'].keys():
            date = str(history['date'][k])
            if date not in stocks and (date < str(start) or date > str(end)) and (k - left_shift) in history['date']:
                stocks[date] = {}
                news_date[date] = str(history['date'][k - left_shift])
            else:
                continue
            stocks[date][history['stock_id'][k]] = history['rate'][k]

        high_rate_date = {"up": [], "down": []}
        for d in stocks:
            if len(stocks[d]) == 0:
                continue
            up = down = True
            for k in stocks[d]:
                if stocks[d][k] >= 0:
                    down = False
                    if stocks[d][k] < rate:
                        up = False
                else:
                    up = False
                    if stocks[d][k] > -1 * rate:
                        down = False
            if up == True:
                high_rate_date['up'].append([
                    str(d), news_date[d],
                    float(str(np.round(float(stocks[d][k]), 2)))
                ])
            if down == True:
                high_rate_date['down'].append([
                    str(d), news_date[d],
                    float(str(np.round(float(stocks[d][k]), 2)))
                ])
        high_rate_dates.append(high_rate_date)

    return high_rate_dates
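# A small sketch of the grouped return calculation used in getHighRateDate,
# with made-up prices; the column names mirror the ones in the method.
import pandas as pd

history = pd.DataFrame({
    'stock_id': ['000001'] * 3 + ['000002'] * 3,
    'date': ['2019-01-02', '2019-01-03', '2019-01-04'] * 2,
    'closing': [10.0, 10.3, 10.1, 20.0, 19.0, 19.5],
})
g = history.groupby(by=['stock_id'])
# same formula as in the method: today's close / yesterday's close - 1, in percent
history['rate'] = 100 * (g.shift(0)['closing'] / g.shift(1)['closing'] - 1)
print(history.dropna())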
def _stack_multi_columns(frame, level=-1, dropna=True):
    this = frame.copy()

    # this makes life much simpler
    if level != frame.columns.nlevels - 1:
        # roll levels to put selected level at end
        roll_columns = this.columns
        for i in range(level, frame.columns.nlevels - 1):
            roll_columns = roll_columns.swaplevel(i, i + 1)
        this.columns = roll_columns

    if not this.columns.is_lexsorted():
        this = this.sortlevel(0, axis=1)

    # tuple list excluding level for grouping columns
    if len(frame.columns.levels) > 2:
        tuples = zip(*[lev.values.take(lab)
                       for lev, lab in zip(this.columns.levels[:-1],
                                           this.columns.labels[:-1])])
        unique_groups = [key for key, _ in itertools.groupby(tuples)]
        new_names = this.columns.names[:-1]
        new_columns = MultiIndex.from_tuples(unique_groups, names=new_names)
    else:
        new_columns = unique_groups = this.columns.levels[0]

    # time to ravel the values
    new_data = {}
    level_vals = this.columns.levels[-1]
    levsize = len(level_vals)
    drop_cols = []
    for key in unique_groups:
        loc = this.columns.get_loc(key)
        slice_len = loc.stop - loc.start
        # can make more efficient?

        if slice_len == 0:
            drop_cols.append(key)
            continue
        elif slice_len != levsize:
            chunk = this.ix[:, this.columns[loc]]
            chunk.columns = level_vals.take(chunk.columns.labels[-1])
            value_slice = chunk.reindex(columns=level_vals).values
        else:
            if frame._is_mixed_type:
                value_slice = this.ix[:, this.columns[loc]].values
            else:
                value_slice = this.values[:, loc]

        new_data[key] = value_slice.ravel()

    if len(drop_cols) > 0:
        new_columns = new_columns - drop_cols

    N = len(this)

    if isinstance(this.index, MultiIndex):
        new_levels = list(this.index.levels)
        new_names = list(this.index.names)
        new_labels = [lab.repeat(levsize) for lab in this.index.labels]
    else:
        new_levels = [this.index]
        new_labels = [np.arange(N).repeat(levsize)]
        new_names = [this.index.name]  # something better?

    new_levels.append(frame.columns.levels[level])
    new_labels.append(np.tile(np.arange(levsize), N))
    new_names.append(frame.columns.names[level])

    new_index = MultiIndex(levels=new_levels, labels=new_labels,
                           names=new_names)

    result = DataFrame(new_data, index=new_index, columns=new_columns)

    # more efficient way to go about this? can do the whole masking biz but
    # will only save a small amount of time...
    if dropna:
        result = result.dropna(axis=0, how='all')

    return result
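# The variant above targets a legacy pandas API: .ix indexing and
# DataFrame.sortlevel were deprecated and later removed. For reference, the
# modern equivalents of the two calls it relies on (assuming a DataFrame
# `this` with MultiIndex columns) are:
#
#   this.ix[:, this.columns[loc]]   ->  this.loc[:, this.columns[loc]]
#   this.sortlevel(0, axis=1)       ->  this.sort_index(level=0, axis=1)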
def makeOriginDataCsv(cls,
                      cur=None,
                      start_date=None,
                      end_date=None,
                      basic_path=None,
                      output_file=None,
                      stock_id=None):
    # initialise the source and output file paths
    if cur is None or start_date is None or end_date is None or output_file is None or stock_id is None:
        return None
    if basic_path is None:
        basic_path = os.path.dirname(os.path.abspath(__file__))
    output_path = os.path.join(basic_path, output_file)
    VTool.makeDirs(files=[output_path])

    data = cur.execute(
        "select id, stock_id, date, opening, closing, difference, percentage_difference, lowest, highest, volume, amount from history where stock_id = '%s' and date between '%s' and '%s' "
        % (stock_id, start_date, end_date))
    data = cur.fetchall()
    if len(data) == 0:
        return None

    res = []
    for d in data:
        res.append([
            int(d[0]), int(d[1]), str(d[2]), float(d[3]), float(d[4]),
            float(d[5]), float(d[6]), float(d[7]), float(d[8]),
            float(d[9]), float(d[10])
        ])
    new_data = []
    for d in zip(*res):
        new_data.append(d)
    origin_data = {
        'id': new_data[0],
        'stock_id': new_data[1],
        'date': new_data[2],
        'opening': new_data[3],
        'closing': new_data[4],
        'difference': new_data[5],
        'percentage_difference': new_data[6],
        'lowest': new_data[7],
        'highest': new_data[8],
        'volume': new_data[9],
        'amount': new_data[10]
    }

    # read the raw data and keep only the columns that will be used
    total_data = DataFrame(origin_data)
    total_data.sort_values(by=['stock_id', 'date'], inplace=True)

    # group by stock code and compute the daily return (in percent)
    g_stock_num = total_data.groupby(by=["stock_id"])
    total_data["rate"] = 100 * (g_stock_num.shift(0)["closing"] /
                                g_stock_num.shift(1)["closing"] - 1)
    for i in total_data.index:
        total_data.loc[i, 'rate'] = str(
            np.round(float(total_data['rate'][i]), 2))

    # reorder the columns in preparation for building the input/output form
    columns = [
        "stock_id", "date", "opening", "closing", "difference",
        "percentage_difference", "lowest", "highest", "volume", "amount",
        "rate"
    ]
    total_data = total_data[columns]

    def func_train_data(data_one_stock_num):
        # skip the extra first call that pandas groupby.apply makes on the first group
        if cls.groupby_skip == False:
            cls.groupby_skip = True
            return None
        print("Processing stock code: %06s" % data_one_stock_num.name)
        data = {
            "stock_id": [],
            "date": [],
            "opening": [],
            "closing": [],
            "difference": [],
            "percentage_difference": [],
            "lowest": [],
            "highest": [],
            "volume": [],
            "amount": [],
            "rate": []
        }
        for i in range(len(data_one_stock_num.index) - 1):
            for k in data:
                data[k].append(data_one_stock_num.iloc[i][k])
        pd.DataFrame(data).to_csv(output_path, index=False, columns=columns)

    total_data1 = total_data.dropna()
    total_data2 = total_data1.drop(
        total_data1[(total_data1.rate == 'nan')].index)
    g_stock_num = total_data2.groupby(by=["stock_id"])

    # clear the file at the output path and initialise the column names
    cls.groupby_skip = False
    g_stock_num.apply(func_train_data)
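# Hedged sketch of the groupby/apply pattern shared by makeOriginDataCsv and
# the news-based builder: pandas groupby.apply may call the function twice on
# the first group to choose a code path, so a class-level flag discards that
# extra call. The class, method, and file names here are illustrative only.
import pandas as pd

class Builder:
    groupby_skip = False

    @classmethod
    def run(cls, frame, out_path):
        def per_stock(group):
            if cls.groupby_skip == False:
                cls.groupby_skip = True
                return None
            group.to_csv(out_path, index=False, mode='a', header=False)

        cls.groupby_skip = False
        frame.groupby(by=['stock_id']).apply(per_stock)

# Builder.run(total_data, 'origin_data.csv')  # hypothetical call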