Ejemplo n.º 1
0
def parser_for_hs300(start=settings.S_TRAIN_DATE, end=settings.Time_Slot):
	'''
	Load the HS300 index series and return it truncated to [start, end].

	One file named 'MarketData_000300_SH_<token>.csv' is expected in the
	'hs300' sub-folder of settings.TXT_DATA_PATH for every token returned by
	datetime2month(start, end). Missing files are reported and skipped.

	Each line contributes a (timestamp, value) sample taken from its first two
	comma-separated fields. The samples go through rolling.rolling_anal with
	settings.WINDOW_SIZE, are repacked as (timestamp, (value,)) pairs and are
	finally handed to truncate(start, end, ...), whose result is returned.
	'''
	data_folder = os.path.join(settings.TXT_DATA_PATH, 'hs300')
	from pretreat_data.choose_folder import datetime2month

	# file names are sorted before being joined onto the data folder
	csv_names = sorted('MarketData_000300_SH_' + mon + '.csv'
			for mon in datetime2month(start, end))
	csv_paths = [os.path.join(data_folder, name) for name in csv_names]

	samples = []
	for csv_path in csv_paths:
		if os.path.isfile(csv_path):
			with open(csv_path) as handle:
				for row in handle:
					fields = row.split(',')
					# only the first 15 characters of field 0 are parsed as the timestamp
					stamp = datetime.strptime(fields[0][:15], "%Y%m%d %H%M%S")
					samples.append((stamp, float(fields[1])))
		else:
			print('Warning: %s does not existing, will skip it' % csv_path)

	# column 0 holds the timestamps, column 1 the index values
	samples = np.asarray(samples)
	rolled = rolling.rolling_anal(start, end, settings.WINDOW_SIZE, samples[:, 0], samples[:, 1])

	# wrap each value in a 1-tuple so rows look like (timestamp, (value,))
	return truncate(start, end, [(stamp, (value,)) for stamp, value in rolled])
Ejemplo n.º 2
0
def parser_for_stock(data_folders, start=settings.S_TRAIN_DATE, end=settings.Time_Slot,
			save_to_dict=False):
	'''
	Build the price matrix for the symbols in settings.SYMBOLS and keep only
	the columns chosen by naive_ranking.

	For every symbol, the first file in each data folder whose name contains
	the symbol is read line by line. From each line the function takes:
	  * field 0  -> timestamp (first 15 characters, "%Y%m%d %H%M%S")
	  * field 2  -> b1 and field 12 -> s1 (presumably best bid / best ask;
	                verify against the data format)
	  * last field -> v (presumably the cumulative volume of the day; verify)
	The mid price (b1 + s1)/2 is rolled with rolling.rolling_anal and bound
	into the running matrix with logic.bind. The half spread (b1 - s1)/2 and
	the last v seen on each calendar day are averaged into score_dict, which
	is what naive_ranking.naive_ranking receives.

	data_folders: iterable of directories to scan for each symbol.
	start, end: passed to rolling.rolling_anal and truncate.
	save_to_dict: when true the rolled array of each symbol is stored in a
		local dict. NOTE(review): that dict is never returned, so the flag
		currently has no effect visible to the caller.

	Returns (stock_matrix, matrix_title): stock_matrix is a list of
	(timestamp, tuple_of_selected_values) rows built from the truncated
	matrix, matrix_title the matching list of symbols.
	'''

	ts_data_dict, score_dict = {}, {}
	stock_matrix = []

	for c, symbol in enumerate(settings.SYMBOLS):
		print('%s %s' % (c, symbol))
		tmp_timeline, tmp_ave, tmp_spread, tmp_vol = [], [], [], []
		for folder in data_folders:
			matches = [os.path.join(folder, name)
					for name in os.listdir(folder)
					if symbol in name]
			if not matches:
				print("Warning: empty file of %s in the data folder %s" % (symbol, folder))
				continue
			filename = matches[0]

			# use txt reader
			with open(filename) as f:
				# day and volume of the previous line; None until the first
				# data line of this file has been read
				last_day, cache_v = None, None
				for line in f:
					if not line.strip():
						# tolerate blank lines (e.g. a trailing newline)
						continue
					tmp_line = line.split(',')
					t = datetime.strptime(tmp_line[0][:15], "%Y%m%d %H%M%S")
					b1, s1, v = float(tmp_line[2]), float(tmp_line[12]), float(tmp_line[-1])
					tmp_timeline.append(t)
					tmp_ave.append((b1 + s1)/2.)
					tmp_spread.append((b1 - s1)/2.)

					# When a new calendar day starts, record the last volume
					# of the day that just ended. Full dates are compared so
					# month boundaries (31 -> 1) are detected, and the
					# tracker always advances so each day is recorded once.
					day = t.date()
					if last_day is not None and day > last_day:
						tmp_vol.append(cache_v)
					last_day, cache_v = day, v

				# the final day of the file has no following line to close it
				if last_day is not None:
					tmp_vol.append(cache_v)

		# truncate the raw data to the time series data
		tmp_data = np.vstack((tmp_timeline, tmp_ave)).T
		# rolling the data
		# NOTE: the start and end used here is meaningless, BUG, Lance, 2013/10/20
		tmp_data = rolling.rolling_anal(start, end, settings.WINDOW_SIZE, tmp_data[:, 0], tmp_data[:, 1])

		# save the numpy array to the dict if request
		if save_to_dict:
			ts_data_dict[symbol] = tmp_data

		# convert to the tuple, make it faster for binding
		tmp_data = [(a[0], tuple(a[1:])) for a in tmp_data]

		if c == 0:
			stock_matrix = tmp_data
		elif tmp_data:
			stock_matrix = logic.bind(stock_matrix, tmp_data)
		else:
			# keep the column count aligned with settings.SYMBOLS
			stock_matrix = [(i, j + ('NA',)) for i, j in stock_matrix]

		if tmp_data:
			# here will use the redundant values of spread and
			# total prices as the ranking criterion, 2013/10/19
			score_dict[symbol] = (np.average(tmp_spread), np.average(tmp_vol))
		else:
			score_dict[symbol] = (None, None)

	# selecting symbols and prepare the matrix
	selected_index = naive_ranking.naive_ranking(score_dict)
	raw_stock_matrix = truncate(start, end, stock_matrix)
	stock_matrix = [(t, tuple([s[i] for i in selected_index]))
			for t, s in raw_stock_matrix]
	matrix_title = [settings.SYMBOLS[i] for i in selected_index]

	return stock_matrix, matrix_title
Ejemplo n.º 3
0
def parser_for_prediction(data_folders, selected_symbols,
					start=settings.S_TRAIN_DATE, end=settings.Time_Slot,
					save_to_dict=False):
	'''
	Build the merged mid-price matrix for an already selected list of symbols.

	For every symbol in selected_symbols, the first file in each data folder
	whose name contains the symbol is read line by line. Field 0 gives the
	timestamp (first 15 characters, "%Y%m%d %H%M%S"); fields 2 and 12 give
	b1 and s1 (presumably best bid / best ask; verify against the data
	format). The mid price (b1 + s1)/2 is rolled with rolling.rolling_anal
	and merged into the running matrix with logic.merge.

	No ranking happens here, so no spread or volume statistics are collected.

	data_folders: iterable of directories to scan for each symbol.
	selected_symbols: symbols to load, in output column order.
	start, end: passed to rolling.rolling_anal and truncate.
	save_to_dict: when true the rolled array of each symbol is stored in a
		local dict. NOTE(review): that dict is never returned, so the flag
		currently has no effect visible to the caller.

	Returns the result of truncate(start, end, merged_matrix).
	'''

	ts_data_dict = {}
	stock_matrix = []

	for c, symbol in enumerate(selected_symbols):
		print('%s %s' % (c, symbol))
		tmp_timeline, tmp_ave = [], []
		for folder in data_folders:
			matches = [os.path.join(folder, name)
					for name in os.listdir(folder)
					if symbol in name]
			if not matches:
				print("Warning: empty file of %s in the data folder %s" % (symbol, folder))
				continue
			filename = matches[0]

			# use txt reader
			with open(filename) as f:
				for line in f:
					if not line.strip():
						# tolerate blank lines (e.g. a trailing newline)
						continue
					tmp_line = line.split(',')
					t = datetime.strptime(tmp_line[0][:15], "%Y%m%d %H%M%S")
					b1, s1 = float(tmp_line[2]), float(tmp_line[12])
					tmp_timeline.append(t)
					tmp_ave.append((b1 + s1)/2.)

		# truncate the raw data to the time series data
		tmp_data = np.vstack((tmp_timeline, tmp_ave)).T
		# rolling the data
		# NOTE: the start and end used here is meaningless, BUG, Lance, 2013/10/20
		tmp_data = rolling.rolling_anal(start, end, settings.WINDOW_SIZE, tmp_data[:, 0], tmp_data[:, 1])

		# save the numpy array to the dict if request
		if save_to_dict:
			ts_data_dict[symbol] = tmp_data

		# convert to the tuple, make it faster for binding
		tmp_data = [(a[0], tuple(a[1:])) for a in tmp_data]

		# the first symbol seeds the matrix, later ones are merged into it
		if c == 0:
			stock_matrix = tmp_data
		else:
			stock_matrix = logic.merge(stock_matrix, tmp_data)

	stock_matrix = truncate(start, end, stock_matrix)
	return stock_matrix