def download_current_dataset(self, dest_path=".", dest_filename=None, unzip=True):
    """Download the dataset for the currently active round.

    Args:
        dest_path: folder in which to store the dataset (default: cwd)
        dest_filename: file name for the dataset archive; derived from the
            current round number when not given
        unzip: whether to extract the archive after downloading

    Returns:
        path to the (possibly pre-existing) dataset archive
    """
    # Derive a default file name from the round number, or normalise the
    # caller-supplied name so it carries a ".zip" suffix when we must unzip.
    if dest_filename is None:
        dest_filename = "numerai_dataset_{0}.zip".format(self.get_current_round())
    elif unzip and not dest_filename.endswith(".zip"):
        dest_filename += ".zip"

    target = os.path.join(dest_path, dest_filename)

    # Skip the download entirely when the archive is already on disk.
    if os.path.exists(target):
        self.logger.info("target file already exists")
        return target

    # Make sure the destination folder exists before writing into it.
    utils.ensure_directory_exists(dest_path)
    utils.download_file(self.get_dataset_url(), target, self.show_progress_bars)

    if unzip:
        # Strip the ".zip" suffix to name the extraction folder.
        self._unzip_file(target, dest_path, dest_filename[:-4])
    return target
def test_ensure_directory_exists(tmpdir):
    """ensure_directory_exists creates the directory and is idempotent."""
    target = str(tmpdir.join("somedirectory"))

    utils.ensure_directory_exists(target)
    assert os.path.exists(target)

    # Calling it again on an already-existing path must not raise.
    utils.ensure_directory_exists(target)
    assert os.path.exists(target)
def on_new_training_data(self, round_number):
    """Fit a fresh model on the round's training data and pickle it to disk.

    Args:
        round_number: Numerai round whose training data is used.
    """
    # Get tournament name
    napi = RobustNumerAPI()
    if self.tournament_id is None:
        self.tournament_id = self.numerauto.tournament_id
    tournament_name = napi.tournament_number2name(self.tournament_id)

    train_x = pd.read_csv(
        self.numerauto.get_dataset_path(round_number) / 'numerai_training_data.csv',
        header=0)
    # Each tournament has its own target column; drop all of them from the features.
    target_columns = {x for x in list(train_x) if x.startswith('target_')}
    # FIX: .values replaces DataFrame.as_matrix(), which was deprecated in
    # pandas 0.23 and removed in 1.0 (matches the newer trainer variants).
    train_y = train_x['target_' + tournament_name].values
    train_x = train_x.drop({'id', 'era', 'data_type'} | target_columns,
                           axis=1).values

    logger.info(
        'SKLearnModelTrainer(%s): Fitting model for tournament %s round %d',
        self.name, tournament_name, round_number)
    model = self.model_factory()
    model.fit(train_x, train_y)

    ensure_directory_exists(
        Path('./models/tournament_{}/round_{}'.format(
            tournament_name, round_number)))
    model_filename = Path('./models/tournament_{}/round_{}/{}.p'.format(
        tournament_name, round_number, self.name))
    # FIX: context manager closes the file handle deterministically
    # (the original left the handle from open(...) dangling).
    with open(model_filename, 'wb') as f:
        pickle.dump(model, f)
def on_new_tournament_data(self, round_number):
    """Apply the last trained model to the round's tournament data and write
    the predictions CSV.

    Args:
        round_number: Numerai round whose tournament data is scored.
    """
    # Get tournament name
    napi = RobustNumerAPI()
    if self.tournament_id is None:
        self.tournament_id = self.numerauto.tournament_id
    tournament_name = napi.tournament_number2name(self.tournament_id)

    test_x = pd.read_csv(
        self.numerauto.get_dataset_path(round_number) / 'numerai_tournament_data.csv',
        header=0)
    target_columns = {x for x in list(test_x) if x.startswith('target_')}
    test_ids = test_x['id']
    # FIX: .values replaces DataFrame.as_matrix(), which was deprecated in
    # pandas 0.23 and removed in 1.0 (matches the newer trainer variants).
    test_x = test_x.drop({'id', 'era', 'data_type'} | target_columns,
                         axis=1).values

    logger.info(
        'SKLearnModelTrainer(%s): Applying model for tournament %s round %d',
        self.name, tournament_name, round_number)

    # Load the model fitted in the most recent training round.
    model_filename = Path('./models/tournament_{}/round_{}/{}.p'.format(
        tournament_name,
        self.numerauto.persistent_state['last_round_trained'], self.name))
    # FIX: context manager closes the file handle deterministically
    # (the original left the handle from open(...) dangling).
    with open(model_filename, 'rb') as f:
        model = pickle.load(f)
    predictions = model.predict_proba(test_x)[:, 1]

    df = pd.DataFrame(predictions,
                      columns=['probability_' + tournament_name],
                      index=test_ids)
    ensure_directory_exists(
        Path('./predictions/tournament_{}/round_{}'.format(
            tournament_name, round_number)))
    df.to_csv(Path('./predictions/tournament_{}/round_{}/{}.csv'.format(
        tournament_name, round_number, self.name)),
        index_label='id', float_format='%.8f')
def on_new_training_data(self, round_number):
    """Fit a fresh model on the round's training data, pickle it under the
    configured model directory and record the file name in the report.

    Args:
        round_number: Numerai round whose training data is used.
    """
    tournament_name = self.numerauto.tournaments[self.tournament_id]

    train_x = pd.read_csv(
        self.numerauto.get_dataset_path(round_number) / 'numerai_training_data.csv',
        header=0)
    # Each tournament has its own target column; drop all of them from the features.
    target_columns = {x for x in list(train_x) if x.startswith('target_')}
    train_y = train_x['target_' + tournament_name].values
    train_x = train_x.drop({'id', 'era', 'data_type'} | target_columns,
                           axis=1).values

    logger.info(
        'SKLearnModelTrainer(%s): Fitting model for tournament %s round %d',
        self.name, tournament_name, round_number)
    model = self.model_factory()
    model.fit(train_x, train_y)

    ensure_directory_exists(
        self.numerauto.config['model_directory'] /
        'tournament_{}/round_{}'.format(tournament_name, round_number))
    model_filename = self.numerauto.config[
        'model_directory'] / 'tournament_{}/round_{}/{}.p'.format(
            tournament_name, round_number, self.name)
    # FIX: context manager closes the file handle deterministically
    # (the original left the handle from open(...) dangling).
    with open(model_filename, 'wb') as f:
        pickle.dump(model, f)

    # Record where the model was stored so report writers can pick it up.
    self.numerauto.report['training'][tournament_name][
        self.name]['filename'] = model_filename
def _unzip_file(self, src_path, dest_path, filename):
    """Extract the zip archive at *src_path* into ``dest_path/filename``.

    Args:
        src_path: path of the zip archive to extract
        dest_path: parent folder for the extraction directory
        filename: name of the sub-folder receiving the archive contents

    Returns:
        True on completion
    """
    self.logger.info("unzipping file...")

    # The archive contents go into a dedicated sub-folder under dest_path.
    extraction_dir = os.path.join(dest_path, filename)
    utils.ensure_directory_exists(extraction_dir)

    with zipfile.ZipFile(src_path, "r") as archive:
        archive.extractall(extraction_dir)
    return True
def download_latest_data(self, data_type: str, extension: str = "csv",
                         dest_path: str = ".", dest_filename: str = None):
    """Download the latest data file of the given type.

    Args:
        data_type (str): kind of data to fetch (as accepted by
            get_latest_data_url)
        extension (str, optional): file extension, defaults to "csv"
        dest_path (str, optional): destination folder, defaults to `.`
        dest_filename (str, optional): destination file name, defaults to
            `latest_numerai_<data_type>_data.<extension>`

    Returns:
        str: path to the downloaded file
    """
    # set up download path
    if dest_filename is None:
        dest_filename = f"latest_numerai_{data_type}_data.{extension}"
    dataset_path = os.path.join(dest_path, dest_filename)

    # create parent folder if necessary
    utils.ensure_directory_exists(dest_path)

    url = self.get_latest_data_url(data_type, extension)
    utils.download_file(url, dataset_path, self.show_progress_bars)
    # FIX: return the path for consistency with the other download helpers
    # (download_current_dataset / download_validation_data); previously the
    # caller had no way to learn where the file was written.
    return dataset_path
def download_current_dataset(self, dest_path=".", dest_filename=None,
                             unzip=True, tournament=8):
    """Download dataset for the current active round.

    Args:
        dest_path (str, optional): destination folder, defaults to `.`
        dest_filename (str, optional): desired filename of dataset file,
            defaults to `numerai_dataset_<round number>.zip`
        unzip (bool, optional): indication of whether the training data
            should be unzipped, defaults to `True`
        tournament (int, optional): ID of the tournament, defaults to 8
            -- DEPRECATED there is only one tournament nowadays

    Returns:
        str: Path to the downloaded dataset

    Example:
        >>> NumerAPI().download_current_dataset()
        ./numerai_dataset_104.zip
    """
    if dest_filename is None:
        # Fall back to a placeholder when the round number can't be resolved.
        try:
            round_number = self.get_current_round(tournament)
        except ValueError:
            round_number = "x"
        dest_filename = f"numerai_dataset_{round_number}.zip"
    elif unzip and not dest_filename.endswith(".zip"):
        # Guarantee a ".zip" suffix on caller-supplied names when unzipping.
        dest_filename += ".zip"

    dataset_path = os.path.join(dest_path, dest_filename)

    # Reuse an already-downloaded archive instead of fetching it again.
    if os.path.exists(dataset_path):
        self.logger.info("target file already exists")
        return dataset_path

    utils.ensure_directory_exists(dest_path)
    utils.download_file(self.get_dataset_url(tournament), dataset_path,
                        self.show_progress_bars)

    if unzip:
        # Drop the trailing ".zip" to name the extraction folder.
        self._unzip_file(dataset_path, dest_path, dest_filename[:-4])
    return dataset_path
def download_validation_data(self, dest_path: str = ".",
                             dest_filename: str = None) -> str:
    """download CSV file with historical targets and ticker universe

    Returns:
        str: path to csv file

    Example:
        >>> SignalsAPI().download_validation_data()
        signals_train_val_bbg.csv
    """
    # Fall back to the canonical file name when none was supplied.
    if dest_filename is None:
        dest_filename = "numerai_signals_historical.csv"
    path = os.path.join(dest_path, dest_filename)

    # Create the parent folder if necessary, then fetch the file.
    utils.ensure_directory_exists(dest_path)
    utils.download_file(self.HISTORICAL_DATA_URL, path,
                        self.show_progress_bars)
    return path
def on_new_tournament_data(self, round_number):
    """Apply the last trained model to the round's tournament data, save the
    predictions CSV under the configured prediction directory and record the
    file name in the report.

    Args:
        round_number: Numerai round whose tournament data is scored.
    """
    tournament_name = self.numerauto.tournaments[self.tournament_id]

    test_x = pd.read_csv(
        self.numerauto.get_dataset_path(round_number) / 'numerai_tournament_data.csv',
        header=0)
    target_columns = {x for x in list(test_x) if x.startswith('target_')}
    test_ids = test_x['id']
    test_x = test_x.drop({'id', 'era', 'data_type'} | target_columns,
                         axis=1).values

    logger.info(
        'SKLearnModelTrainer(%s): Applying model for tournament %s round %d',
        self.name, tournament_name, round_number)

    # Load the model fitted in the most recent training round.
    model_filename = self.numerauto.config[
        'model_directory'] / 'tournament_{}/round_{}/{}.p'.format(
            tournament_name,
            self.numerauto.persistent_state['last_round_trained'], self.name)
    # FIX: context manager closes the file handle deterministically
    # (the original left the handle from open(...) dangling).
    with open(model_filename, 'rb') as f:
        model = pickle.load(f)
    predictions = model.predict(test_x)

    df = pd.DataFrame(predictions,
                      columns=['prediction_' + tournament_name],
                      index=test_ids)
    ensure_directory_exists(
        self.numerauto.config['prediction_directory'] /
        'tournament_{}/round_{}'.format(tournament_name, round_number))
    df.to_csv(self.numerauto.config['prediction_directory'] /
              'tournament_{}/round_{}/{}.csv'.format(
                  tournament_name, round_number, self.name),
              index_label='id', float_format='%.8f')

    # Record where the predictions were stored for report writers.
    self.numerauto.report['predictions'][tournament_name][
        self.name + '.csv']['filename'] = self.numerauto.config[
            'prediction_directory'] / 'tournament_{}/round_{}/{}.csv'.format(
                tournament_name, round_number, self.name)
def on_cleanup(self, round_number):
    """Write the report accumulated during this round to a plain-text file
    in the configured report directory.

    Args:
        round_number: round whose report is written (used in the file name).
    """
    # FIX: named function instead of a lambda bound to a name (PEP 8 E731),
    # and isinstance() instead of type() == ... comparisons.
    def to_dict(x):
        """Recursively turn nested defaultdicts back into plain dicts."""
        if isinstance(x, collections.defaultdict):
            return {y: to_dict(x[y]) for y in x}
        return x

    ensure_directory_exists(self.numerauto.config['report_directory'])
    filename = self.numerauto.config[
        'report_directory'] / 'round_{}.txt'.format(round_number)
    with open(filename, 'w') as f:
        # Recurse through the dictionary structure and write one
        # "key: value" line per leaf, indenting one space per nesting level.
        def write_dict(d, indent=0):
            for key in d:
                value = d[key]
                if isinstance(value, dict):
                    f.write(' ' * indent + str(key) + ':\n')
                    write_dict(value, indent + 1)
                else:
                    f.write(' ' * indent + str(key) + ': ' + str(value) + '\n')

        logger.debug('BasicReportWriter(%s): Writing report to file: %s',
                     self.name, filename)
        write_dict(to_dict(self.numerauto.report))