def main(algo_id=None,
         train_steps=None,
         download_feature_db=None,
         do_preprocessing=None,
         upload_model=None):
    try:
        _start = time.time()
        cfg.load(
            Config(algo_id, train_steps, download_feature_db, do_preprocessing,
                   upload_model))
        cfg.cls_data_formatter = os.path.normpath(
            os.path.join(os.path.dirname(__file__), 'data_formatter.py'))
        cfg.cls_coder = os.path.normpath(
            os.path.join(os.path.dirname(__file__), 'coder.py'))
        Env.init()

        preprocessor = Preprocessor()
        preprocessor.process()

        trainer = Trainer()
        trainer.train()

        Uploader.upload_model()

        log.info("[total] took {} seconds in total".format(time.time() - _start))
    except Exception:
        import traceback
        log.info(traceback.format_exc())
        time.sleep(60 * 60 * 24)  # wait to check logs
def create_feature_columns(self, tf_transform_output):
    """Returns feature columns to be used by the model."""
    # Define the feature columns: this is how the transformed inputs are
    # used by the tf model. The output of the feature columns will be
    # stacked into the `tf.feature_column.input_layer`. This object can
    # then be used by the downstream tf graph (e.g., network layers).
    base_features_columns = []
    for key in self.data_formatter.vocabulary_features:
        fc = tf.feature_column.indicator_column(
            tf.feature_column.categorical_column_with_vocabulary_file(
                key=key,
                num_oov_buckets=1,
                vocabulary_file=tf_transform_output.vocabulary_file_by_name(
                    vocab_filename=key)))
        base_features_columns.append(fc)

    # NUM_INT and NUM_FLOAT, already converted to numeric values by tft and scaled.
    base_features_columns += [
        tf.feature_column.numeric_column(key, default_value=0.)
        for key in self.data_formatter.number_features
    ]

    log.info('len of feature_columns: {}'.format(len(base_features_columns)))
    for fc in base_features_columns:
        log.info('feature column {}'.format(fc.name))
    return base_features_columns
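# --- Illustrative usage (not part of the original module) ---
# A minimal sketch of how the columns returned by create_feature_columns could
# be consumed in an estimator-style model_fn, assuming the TF 1.x feature
# column API used above. `features` is the dict of transformed tensors coming
# from the input_fn; the hidden layer size is an arbitrary assumption.
def build_input_layer(features, feature_columns):
    # Stack all feature columns into one dense tensor for downstream layers.
    net = tf.feature_column.input_layer(features, feature_columns)
    return tf.layers.dense(net, units=128, activation=tf.nn.relu)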
def split_to_shards(self, file_name):
    # print("start to split into shards")
    def split_file(fname_in, fname_out, num_shards):
        f_outs = list()
        for i in range(num_shards):
            # Open outputs in binary mode to match the binary read below.
            f_outs.append(
                open('{}-{:05}-of-{:05}'.format(fname_out, i, num_shards),
                     'wb'))
        f_in = open(fname_in, 'rb')
        r_c = 0
        for line in f_in:
            # Distribute lines across the shards in round-robin order.
            f_outs[r_c].write(line)
            r_c = (r_c + 1) % num_shards
        for f_out in f_outs:
            f_out.close()
        f_in.close()

    st = time.time()
    assert isinstance(cfg.DATASET_NUM_SHARDS, int)
    splitter = mp.Process(target=split_file,
                          args=(cfg.get_shuffled_file(file_name),
                                cfg.get_shard_file(file_name),
                                cfg.DATASET_NUM_SHARDS))
    splitter.start()
    splitter.join()
    # After splitting, remove the original shuffled file.
    # os.remove(self.train_split_fname_out)
    # os.remove(self.eval_split_fname_out)
    log.info('completed splitting Train and Eval files into several shards. '
             'took {:.2f} sec.'.format(time.time() - st))
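# --- Illustrative only ---
# A quick, self-contained demonstration (file name and shard count are assumed
# values) of the shard naming scheme produced by split_file above.
for i in range(4):
    print('{}-{:05}-of-{:05}'.format('train.shuffled', i, 4))
# train.shuffled-00000-of-00004
# train.shuffled-00001-of-00004
# train.shuffled-00002-of-00004
# train.shuffled-00003-of-00004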
def execute(self, all_shares, start_date, end_date):
    """
    The main entry point of an extractor, responsible for fetching the data
    for every given share from the underlying data source.

    :param all_shares: iterable of share ids to extract
    :param start_date:
    :param end_date:
    :return:
    """
    index = 0
    for share in all_shares:
        try:
            index = index + 1
            print("[SequenceExtractor] processing share #{}".format(index))
            self.extract_one_share(share, start_date, end_date)
        except Exception:
            log.info(
                "[SequenceExtractor] failed to extract share_id={}, "
                "start_date={}, end_date={}".format(share, start_date,
                                                    end_date))
            log.error("[error]" + traceback.format_exc())
        time.sleep(1)
    # the sdk should be closed when extraction finishes
    self.sdk.close()
    self._validate_data()
def _extract_one_share_n_days_price_ratio(self, share_id, bar_df,
                                          field="close"):
    """
    Uniform method for extracting any one of the HLOC price fields.

    :param share_id:
    :param bar_df:
    :param field:
    :return:
    """
    price_s = bar_df[field]
    price_list = price_s.tolist()
    date_list = price_s.index.tolist()
    log.info("[extractor] share_id= {}. there are {} rows of price".format(
        share_id, len(price_list)))
    # The price series returned by tushare is in descending date order, so
    # each window starts from the most recent data.
    keys, values = [], []
    n = feature_definition_config["hloc_seq_step"]
    keys = keys + ["time", "share_id"]
    keys = keys + [
        "{}_b".format(field) + str(i - 1) for i in range(n - 1, 0, -1)
    ]
    keys = keys + ["target_{}_price".format(field)]
    keys = keys + ["target_{}_trend".format(field)]
    for index in range(len(price_list)):
        if len(price_list[index:index + n]) == n:
            # else: not enough (N) data, so drop it.
            price_segment = price_list[index:index + n][::-1]
            if self.normalized:
                price_x = [1] + [
                    curr / price_segment[i]
                    for i, curr in enumerate(price_segment[1:-1])
                ]
                price_y = price_segment[-1] / price_segment[-2]
            else:
                price_x = price_segment[:-1]
                price_y = price_segment[-1]
            precision = 4
            price_x = [round(x, precision) for x in price_x]
            price_y = round(price_y, precision)
            if self.normalized:
                trend = 1 if price_y > 1 else 0
            else:
                trend = 1 if price_segment[-1] / price_segment[-2] > 1 else 0
            values.append([str(date_list[index]), str(share_id)] + price_x +
                          [price_y] + [trend])
    return pd.DataFrame(columns=keys, data=values)
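# --- Illustrative only ---
# Example of the column names built above, assuming
# feature_definition_config["hloc_seq_step"] == 5 (an assumed value) and
# field == "close":
#   ["time", "share_id",
#    "close_b3", "close_b2", "close_b1", "close_b0",
#    "target_close_price", "target_close_trend"]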
def _extract_one_share_n_days_ror(self, bar_df, share_id):
    # Reverse the original data frame so it is easier to calculate the RoR
    # (rate of return).
    bar_df = bar_df.iloc[::-1]
    close_s = bar_df['close']
    # print(close_s)
    close_list = close_s.tolist()
    date_list = close_s.index.tolist()
    keys, values = [], []
    # Similar to _extract_one_share_n_days_close, only with the order reversed.
    # Here we calculate ror_5_days, ror_10_days, ...; the loop should use the
    # maximum N.
    n = feature_definition_config["ror_n_days_after"]
    keys += ["time", "share_id"]
    keys += [
        "ror_1_days", "ror_5_days", "ror_10_days", "ror_20_days",
        "ror_40_days", "ror_60_days"
    ]
    try:
        for index in range(len(close_list)):
            if len(close_list[index:index + n]) == n:
                # else: not enough (N) data, so drop it.
                ror1 = round((close_list[index + 1] / close_list[index]) - 1, 4)
                ror5 = round((close_list[index + 5] / close_list[index]) - 1, 4)
                ror10 = round(
                    (close_list[index + 10] / close_list[index]) - 1, 4)
                ror20 = round(
                    (close_list[index + 20] / close_list[index]) - 1, 4)
                ror40 = round(
                    (close_list[index + 40] / close_list[index]) - 1, 4)
                ror60 = round(
                    (close_list[index + 60] / close_list[index]) - 1, 4)
                values.append([
                    str(date_list[index]),
                    str(share_id), ror1, ror5, ror10, ror20, ror40, ror60
                ])
    except IndexError:
        log.info("share_id = {} ror calculation reached the end of the series".
                 format(share_id))
    return pd.DataFrame(columns=keys, data=values)
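# --- Illustrative only ---
# Worked arithmetic (made-up prices) for the rate-of-return features above:
# with close_list[index] == 10.0 and close_list[index + 5] == 10.8,
#   ror5 = round((10.8 / 10.0) - 1, 4)  # == 0.08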
def _extract_one_share_n_days_close(self, bar_df, share_id):
    close_s = bar_df['close']
    # print(close_s)
    close_list = close_s.tolist()
    date_list = close_s.index.tolist()
    log.info("[extractor] share_id= {}. there are {} rows of close price".
             format(share_id, len(close_list)))
    # The close_price series from tushare is in descending date order, so each
    # window starts from the most recent data.
    keys, values = [], []
    n = feature_definition_config["close_n_days_before"]
    keys = keys + ["time", "share_id", "close_price", "target_close_price"]
    # normalize:
    # the first element becomes 1, then each later element is expressed as the
    # ratio to its predecessor.
    assert n > 2
    for index in range(len(close_list)):
        if len(close_list[index:index + n]) == n:
            # else: not enough (N) data, so drop it.
            close_seq = close_list[index:index + n]
            if self.normalized:
                close_price = [1] + [
                    curr / close_seq[i]
                    for i, curr in enumerate(close_seq[1:])
                ]
                target_close_price = close_seq[-1] / close_seq[-2]
            else:
                close_price = close_seq[:-2]
                target_close_price = close_seq[-1]
            # adjust the precision for floats
            precision = 4
            close_price = [round(x, precision) for x in close_price]
            target_close_price = round(target_close_price, precision)
            values.append([
                str(date_list[index]),
                str(share_id), close_price, target_close_price
            ])
    return pd.DataFrame(columns=keys, data=values)
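# --- Illustrative only ---
# A toy, hand-checked walk-through (made-up prices) of the ratio normalisation
# used in _extract_one_share_n_days_close: the first element becomes 1 and each
# later element is the ratio to its predecessor.
close_seq = [10.0, 11.0, 12.1, 12.1]
close_price = [1] + [curr / close_seq[i] for i, curr in enumerate(close_seq[1:])]
target_close_price = close_seq[-1] / close_seq[-2]
print(close_price)         # ~[1, 1.1, 1.1, 1.0]
print(target_close_price)  # 1.0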
def main():
    try:
        _start = time.time()
        cfg.load(Config())
        cfg.cls_data_formatter = os.path.normpath(
            os.path.join(os.path.dirname(__file__), 'data_formatter.py'))
        cfg.cls_coder = os.path.normpath(
            os.path.join(os.path.dirname(__file__), 'coder.py'))

        preprocessor = Preprocessor()
        preprocessor.process()

        trainer = Trainer()
        trainer.train()

        log.info("[total] took {} seconds in total".format(time.time() - _start))
    except Exception:
        import traceback
        log.info(traceback.format_exc())
        time.sleep(60 * 60 * 24)  # wait to check logs
def _extract_one_share_n_days_vol(self, bar_df, share_id):
    vol_s = bar_df['vol']
    # print(vol_s)
    vol_list = vol_s.tolist()
    date_list = vol_s.index.tolist()
    log.info("[extractor] share_id= {}. there are {} rows of volume".format(
        share_id, len(vol_list)))
    # this part is the same as _extract_one_share_n_days_close
    keys, values = [], []
    n = feature_definition_config["close_n_days_before"]
    keys = keys + ["time", "share_id", "volume", "target_volume"]
    assert n > 2
    for index in range(len(vol_list)):
        if len(vol_list[index:index + n]) == n:
            # else: not enough (N) data, so drop it.
            seq = vol_list[index:index + n]
            if self.normalized:
                volume = [1] + [
                    curr / seq[i] for i, curr in enumerate(seq[1:])
                ]
                target_volume = seq[-1] / seq[-2]
            else:
                volume = seq[:-2]
                target_volume = seq[-1]
            # adjust the precision for floats
            precision = 4
            volume = [round(x, precision) for x in volume]
            target_volume = round(target_volume, precision)
            values.append([
                str(date_list[index]),
                str(share_id), volume, target_volume
            ])
    return pd.DataFrame(columns=keys, data=values)
def shuf(self, file_name):
    st = time.time()
    log.info('shuffle start')
    # Use terashuf for a quasi-shuffle.
    FileUtil.save_remove_first_line(
        cfg.get_exp_file(file_name),
        cfg.get_exp_file_without_header(file_name))
    shuf_cmd = 'MEMORY={:.1f} terashuf < {} > {}'.format(
        cfg.SHUF_MEM, cfg.get_exp_file_without_header(file_name),
        cfg.get_shuffled_file(file_name))
    log.info('Executing shuf call: \"{}\"'.format(shuf_cmd))
    ret_stat = os.system(shuf_cmd)
    if ret_stat != 0:
        log.info('`terashuf` failed, falling back to `sort -R`.')
        shuf_cmd = 'sort -R {} -o {}'.format(
            cfg.get_exp_file_without_header(file_name),
            cfg.get_shuffled_file(file_name))
        log.info('Executing shuf call: \"{}\"'.format(shuf_cmd))
        ret_stat = os.system(shuf_cmd)
    log.info("shuffle complete. took {:.2f}s".format(time.time() - st))
def main(test=None):
    _start = time.time()
    econfig.init(test)
    extractor = FeatureExtractor()

    # select the target shares first
    # SSE 50 (shangzheng 50) index constituents
    ingredient_df = context.tushare.index_weight(index_code='000016.SH',
                                                 start_date='20080101',
                                                 end_date='20180101')
    ingredient_share_set = set(ingredient_df['con_code'].tolist())

    target_shares = [
        "601398.SH",
        "601288.SH",
        "601988.SH",
        "601939.SH",
        "601328.SH",
    ]

    if econfig.DEBUG:
        # extractor.extract_all(start_date='20080101', end_date='20080301',
        #                       params={'normalized': True, 'output_name': TRAIN_FILE_NAME})  # as train
        # extractor.extract_all(start_date='20190101', end_date='20190301',
        #                       params={'normalized': True, 'output_name': EVAL_FILE_NAME})  # as eval
        # extractor.extract_multiple(share_ids=list(ingredient_share_set), start_date='20080101', end_date='20180101',
        #                            params={'normalized': True, 'output_name': TRAIN_FILE_NAME})  # as train
        # extractor.extract_multiple(share_ids=list(ingredient_share_set), start_date='20180102', end_date='20190901',
        #                            params={'normalized': True, 'output_name': EVAL_FILE_NAME})  # as eval
        extractor.extract_multiple(share_ids=target_shares,
                                   start_date='20080101',
                                   end_date='20180101',
                                   params={
                                       'normalized': False,
                                       'output_name': TRAIN_FILE_NAME
                                   })  # as train
        extractor.extract_multiple(share_ids=target_shares,
                                   start_date='20180102',
                                   end_date='20190901',
                                   params={
                                       'normalized': False,
                                       'output_name': EVAL_FILE_NAME
                                   })  # as eval
    else:
        # for test stage 1, only extract the most recent 4000 days of data
        # extractor.extract_all(start_date='20050101', end_date='20181231')
        # extractor.extract_all(start_date='20080101', end_date='20180101',
        #                       params={'normalized': True, 'output_name': TRAIN_FILE_NAME})  # as train
        # extractor.extract_all(start_date='20180102', end_date='20190901',
        #                       params={'normalized': True, 'output_name': EVAL_FILE_NAME})  # as eval
        extractor.extract_multiple(share_ids=list(ingredient_share_set),
                                   start_date='20180101',
                                   end_date='20190601',
                                   params={
                                       'normalized': True,
                                       'output_name': TRAIN_FILE_NAME
                                   })  # as train
        extractor.extract_multiple(share_ids=list(ingredient_share_set),
                                   start_date='20190602',
                                   end_date='20191101',
                                   params={
                                       'normalized': True,
                                       'output_name': EVAL_FILE_NAME
                                   })  # as eval
        # extractor.extract_multiple(share_ids=target_shares, start_date='20080101', end_date='20180101',
        #                            params={'normalized': True, 'output_name': TRAIN_FILE_NAME})  # as train
        # extractor.extract_multiple(share_ids=target_shares, start_date='20180102', end_date='20190901',
        #                            params={'normalized': True, 'output_name': EVAL_FILE_NAME})  # as eval

    log.info("extraction completed, took {}s".format(str(time.time() - _start)))