#----------------------------------
# NNLS fitting
#----------------------------------
# NOTE(review): the statements below continue code that starts above this
# chunk; the nesting level they belong at cannot be determined from here --
# confirm the indentation against the enclosing definition before merging.
from modeling import nnls_parser

obj_nnls = nnls_parser.BASIC_NNLS(ts_matrix, ts_matrix_title)

# Append this time slot's fitted_r2 value to the running results file.  The
# ``with`` block closes the handle even if formatting or writing raises.
with open('result_r2.dat', 'a') as fout:
    fout.write("%s\t%.6f\n" % (settings.TIME_SLOT, obj_nnls.fitted_r2))

# NOTE(review): obj_nnls is a project object rather than a plain array, so
# np.save presumably stores it as a pickled object array; reading it back
# would then need np.load(..., allow_pickle=True) -- only load trusted files.
np.save('../tmp_data/' + settings.TIME_SLOT + '_obj.npy', obj_nnls)

if pred_folders is not None:
    # Take the symbol list from the title row, dropping its first and last
    # entries (presumably the non-stock columns -- TODO confirm).
    selected_symbols = ts_matrix_title[1:-1]

    # Build the prediction matrix for exactly those symbols.
    # NOTE(review): this section reads both settings.TIME_SLOT and
    # settings.Time_Slot -- confirm both attributes exist and are intended.
    pr_stock_matrix = prep_matrix.parser_for_prediction(
        pred_folders, selected_symbols,
        start=settings.Time_Slot, end=settings.E_PRED_DATE)
    pr_hs300_matrix = prep_matrix.parser_for_hs300(
        start=settings.Time_Slot, end=settings.E_PRED_DATE)
    pr_matrix = ts_format.ts2npy(logic.merge(pr_stock_matrix, pr_hs300_matrix))

    np.save('../tmp_data/' + settings.TIME_SLOT + '_pred.npy', pr_matrix)
    obj_nnls.predict(pr_matrix)


def test():
    '''
    Run part2() under cProfile and print the profiling report.

    To run without profiling overhead, call part2() directly instead.
    '''
    import cProfile
    # cProfile.run evaluates the statement string in the __main__ namespace,
    # so part2 must be defined there.
    cProfile.run('part2()')
def parser_for_prediction(data_folders, selected_symbols, start=settings.S_TRAIN_DATE, end=settings.Time_Slot, save_to_dict=False):
    '''
    Build the merged, truncated mid-price time series for the given symbols.

    For every symbol, the first file in each data folder whose name contains
    the symbol is read line by line.  Each comma-separated line contributes
    a timestamp (first 15 characters of field 0, format "%Y%m%d %H%M%S") and
    the mid price, i.e. the mean of fields 2 and 12.  The rows of all folders
    are concatenated, passed through rolling.rolling_anal, and merged with
    the series of the previous symbols via logic.merge.  The merged result
    is finally cut with truncate(start, end, ...).

    Parameters:
        data_folders     -- iterable of directories to search for data files
        selected_symbols -- iterable of symbol strings, processed in order
        start, end       -- bounds forwarded to rolling.rolling_anal and
                            truncate
        save_to_dict     -- if true, keep each symbol's rolled array in a
                            local dict (see NOTE below)

    Returns:
        The value returned by truncate() for the merged matrix.

    Raises:
        ValueError / IndexError if a line has a malformed timestamp, a
        non-numeric field 2 or 12, or fewer than 13 fields.

    Notes:
        * A folder with no file matching the symbol prints a warning and is
          skipped.
        * An empty data file simply contributes no rows (earlier revisions
          raised ValueError while peeking at its first line).
        * Spread and volume used to be computed here but were never part of
          the result, so that dead code has been dropped; the last field of
          a line is therefore no longer parsed.
    '''
    # NOTE(review): this dict is filled when save_to_dict is set but is never
    # returned or stored anywhere visible in this function -- confirm whether
    # callers expect to receive it.
    ts_data_dict = {}
    stock_matrix = []

    for c, symbol in enumerate(selected_symbols):
        # Progress output; single-argument form prints the same text on
        # Python 2 and Python 3.
        print("%s %s" % (c, symbol))
        tmp_timeline, tmp_ave = [], []

        for folder in data_folders:
            # First directory entry whose name contains the symbol, if any.
            filename = next(
                (os.path.join(folder, name)
                 for name in os.listdir(folder) if symbol in name),
                None)
            if filename is None:
                print("Warning: empty file of %s in the data folder %s" % (symbol, folder))
                continue

            # Single pass over the file; the handle is always closed.
            with open(filename) as f:
                for line in f:
                    tmp_line = line.split(',')
                    t = datetime.strptime(tmp_line[0][:15], "%Y%m%d %H%M%S")
                    b1, s1 = float(tmp_line[2]), float(tmp_line[12])
                    tmp_timeline.append(t)
                    tmp_ave.append((b1 + s1) / 2.)

        # truncate the raw data to the time series data
        tmp_data = np.vstack((tmp_timeline, tmp_ave)).T
        # rolling the data
        # NOTE: the start and end used here is meaningless, BUG, Lance, 2013/10/20
        tmp_data = rolling.rolling_anal(start, end, settings.WINDOW_SIZE, tmp_data[:, 0], tmp_data[:, 1])

        # save the numpy array to the dict if requested
        if save_to_dict:
            ts_data_dict[symbol] = tmp_data

        # convert each row to (key, tuple_of_values); tuples make the
        # subsequent merge faster
        tmp_data = [(a[0], tuple(a[1:])) for a in tmp_data]

        if c == 0:
            stock_matrix = tmp_data
        else:
            stock_matrix = logic.merge(stock_matrix, tmp_data)

    stock_matrix = truncate(start, end, stock_matrix)
    return stock_matrix