trainset_folders, pred_folders = choose_folder.parser() #---------------------------------- # Prepare the Matrix #---------------------------------- from pretreat_data import prep_matrix, format_matrix from time_series import logic, ts_format # parse the stocks to get the raw stock matrix, ranking, selecting ts_stock_matrix, selected_symbols = \ prep_matrix.parser_for_stock(trainset_folders, start=settings.S_TRAIN_DATE, end=settings.Time_Slot) # get the hs300 index ts_hs300_matrix = prep_matrix.parser_for_hs300(start=settings.S_TRAIN_DATE, end=settings.Time_Slot) # bind with stock to raise the matrix for training ts_matrix = logic.bind(ts_stock_matrix, ts_hs300_matrix) # get the title ts_matrix_title = ['Timeline'] + selected_symbols + ['HS300'] # make integrity ts_matrix, ts_matrix_title = format_matrix.make_integrity(ts_matrix, ts_matrix_title) np.save('../tmp_data/'+settings.TIME_SLOT+'_matrix_title.npy', ts_matrix_title) np.save('../tmp_data/'+settings.TIME_SLOT+'_trainset.npy', ts_matrix) #---------------------------------- # NNLS fitting #---------------------------------- from modeling import nnls_parser obj_nnls = nnls_parser.BASIC_NNLS(ts_matrix, ts_matrix_title)
def parser_for_stock(data_folders, start=settings.S_TRAIN_DATE, end=settings.Time_Slot, save_to_dict=False):
    '''
    Build the stock time-series matrix and pick the symbols to keep.

    For every symbol in settings.SYMBOLS, the first file in each data
    folder whose name contains the symbol is read as comma-separated text.
    From each line the timestamp (first 15 characters of field 0, format
    "%Y%m%d %H%M%S"), field 2, field 12 and the last field are used
    (presumably best bid, best ask and volume -- confirm against the data
    format). The mid price series is passed through rolling.rolling_anal
    and bound, symbol by symbol, into one matrix; symbols without data get
    an 'NA' column. The per-symbol average spread and average volume are
    handed to naive_ranking.naive_ranking to select the columns to keep.

    Args:
        data_folders: iterable of folder paths to scan for each symbol.
        start, end: passed to rolling.rolling_anal and to truncate().
        save_to_dict: if true, the rolled array of each symbol is kept in
            a local dict. NOTE(review): that dict is never returned, so
            the flag currently has no visible effect for callers.

    Returns:
        (stock_matrix, matrix_title): stock_matrix is a list of
        (time, tuple_of_selected_values) pairs; matrix_title is the list
        of selected symbols, in the same order as the tuple entries.

    Raises:
        ValueError / IndexError: on a malformed line (bad timestamp,
        non-numeric field or too few fields).
    '''
    ts_data_dict, score_dict = {}, {}
    stock_matrix = []

    for c, symbol in enumerate(settings.SYMBOLS):
        # Single-string form prints the same text on Python 2 and 3.
        print("%s %s" % (c, symbol))
        tmp_timeline, tmp_ave, tmp_spread, tmp_vol = [], [], [], []

        for folder in data_folders:
            matches = [os.path.join(folder, name)
                       for name in os.listdir(folder) if symbol in name]
            if not matches:
                print("Warning: empty file of %s in the data folder %s" % (symbol, folder))
                continue
            filename = matches[0]

            # The file is opened exactly once and always closed. test_t is
            # the last accepted timestamp; it is seeded by the first line.
            test_t = None
            cache_v = None
            with open(filename) as f:
                for line in f:
                    fields = line.split(',')
                    t = datetime.strptime(fields[0][:15], "%Y%m%d %H%M%S")
                    b1, s1, v = float(fields[2]), float(fields[12]), float(fields[-1])

                    tmp_timeline.append(t)
                    tmp_ave.append((b1 + s1) / 2.)
                    tmp_spread.append((b1 - s1) / 2.)

                    # A tick stamped on a later calendar date than the last
                    # accepted tick reuses the cached volume; any other
                    # tick becomes the new reference. Full dates are
                    # compared (not day-of-month) so that a month boundary
                    # is handled correctly.
                    if test_t is None or t.date() <= test_t.date():
                        test_t = t
                        cache_v = v
                    tmp_vol.append(cache_v)

            if test_t is None:
                # The file exists but holds no lines: skip it like a
                # missing file instead of failing on an empty timestamp.
                print("Warning: empty file of %s in the data folder %s" % (symbol, folder))

        # truncate the raw data to the time series data
        tmp_data = np.vstack((tmp_timeline, tmp_ave)).T

        # rolling the data
        # NOTE: the start and end used here is meaningless, BUG, Lance, 2013/10/20
        tmp_data = rolling.rolling_anal(start, end, settings.WINDOW_SIZE,
                                        tmp_data[:, 0], tmp_data[:, 1])

        # save the numpy array to the dict if requested
        if save_to_dict:
            ts_data_dict[symbol] = tmp_data

        # convert to (time, values-tuple) pairs, which is faster for binding
        tmp_data = [(a[0], tuple(a[1:])) for a in tmp_data]

        if c == 0:
            stock_matrix = tmp_data
        elif tmp_data:
            stock_matrix = logic.bind(stock_matrix, tmp_data)
        else:
            # no data for this symbol: keep the column, filled with 'NA'
            stock_matrix = [(i, j + ('NA',)) for i, j in stock_matrix]

        if tmp_data:
            # the average spread and average volume are used as the
            # ranking criteria, 2013/10/19
            score_dict[symbol] = (np.average(tmp_spread), np.average(tmp_vol))
        else:
            score_dict[symbol] = (None, None)

    # select the symbols and prepare the matrix
    selected_index = naive_ranking.naive_ranking(score_dict)
    raw_stock_matrix = truncate(start, end, stock_matrix)

    stock_matrix = [(t, tuple([s[i] for i in selected_index]))
                    for t, s in raw_stock_matrix]
    matrix_title = [settings.SYMBOLS[i] for i in selected_index]

    return stock_matrix, matrix_title