Example #1
    def download_current_dataset(self, dest_path=".", dest_filename=None,
                                 unzip=True):
        """download dataset for current round

        dest_path: desired location of dataset file (optional)
        dest_filename: desired filename of dataset file (optional)
        unzip: indicates whether to unzip dataset
        """
        # set up download path
        if dest_filename is None:
            round_number = self.get_current_round()
            dest_filename = "numerai_dataset_{0}.zip".format(round_number)
        else:
            # ensure it ends with ".zip"
            if unzip and not dest_filename.endswith(".zip"):
                dest_filename += ".zip"
        dataset_path = os.path.join(dest_path, dest_filename)

        if os.path.exists(dataset_path):
            self.logger.info("target file already exists")
            return dataset_path

        # create parent folder if necessary
        utils.ensure_directory_exists(dest_path)

        url = self.get_dataset_url()
        utils.download_file(url, dataset_path, self.show_progress_bars)

        # unzip dataset
        if unzip:
            # remove the ".zip" at the end
            dataset_name = dest_filename[:-4]
            self._unzip_file(dataset_path, dest_path, dataset_name)

        return dataset_path
Example #2
def test_ensure_directory_exists(tmpdir):
    path = str(tmpdir.join("somedirectory"))
    utils.ensure_directory_exists(path)
    assert os.path.exists(path)
    # doing it again with the same (existing) path
    utils.ensure_directory_exists(path)
    assert os.path.exists(path)
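A minimal sketch of what utils.ensure_directory_exists could look like, assuming it simply wraps os.makedirs (the real implementation in the library may differ); the exist_ok flag gives the idempotent behaviour the test above relies on:

import os

def ensure_directory_exists(path):
    """Create the directory at path, doing nothing if it already exists."""
    # exist_ok=True makes repeated calls safe, matching the test above
    os.makedirs(path, exist_ok=True)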
Example #3
    def on_new_training_data(self, round_number):
        # Get tournament name
        napi = RobustNumerAPI()
        if self.tournament_id is None:
            self.tournament_id = self.numerauto.tournament_id
        tournament_name = napi.tournament_number2name(self.tournament_id)

        train_x = pd.read_csv(self.numerauto.get_dataset_path(round_number) /
                              'numerai_training_data.csv',
                              header=0)
        target_columns = set([x for x in list(train_x) if x[0:7] == 'target_'])

        train_y = train_x['target_' + tournament_name].values
        train_x = train_x.drop({'id', 'era', 'data_type'} | target_columns,
                               axis=1).values

        logger.info(
            'SKLearnModelTrainer(%s): Fitting model for tournament %s round %d',
            self.name, tournament_name, round_number)
        model = self.model_factory()
        model.fit(train_x, train_y)

        ensure_directory_exists(
            Path('./models/tournament_{}/round_{}'.format(
                tournament_name, round_number)))
        model_filename = Path('./models/tournament_{}/round_{}/{}.p'.format(
            tournament_name, round_number, self.name))
        pickle.dump(model, open(model_filename, 'wb'))
Example #4
    def on_new_tournament_data(self, round_number):
        # Get tournament name
        napi = RobustNumerAPI()
        if self.tournament_id is None:
            self.tournament_id = self.numerauto.tournament_id
        tournament_name = napi.tournament_number2name(self.tournament_id)

        test_x = pd.read_csv(self.numerauto.get_dataset_path(round_number) /
                             'numerai_tournament_data.csv',
                             header=0)
        target_columns = set([x for x in list(test_x) if x[0:7] == 'target_'])

        test_ids = test_x['id']
        test_x = test_x.drop({'id', 'era', 'data_type'} | target_columns,
                             axis=1).values

        logger.info(
            'SKLearnModelTrainer(%s): Applying model for tournament %s round %d',
            self.name, tournament_name, round_number)
        model_filename = Path('./models/tournament_{}/round_{}/{}.p'.format(
            tournament_name,
            self.numerauto.persistent_state['last_round_trained'], self.name))
        model = pickle.load(open(model_filename, 'rb'))
        predictions = model.predict_proba(test_x)[:, 1]

        df = pd.DataFrame(predictions,
                          columns=['probability_' + tournament_name],
                          index=test_ids)
        ensure_directory_exists(
            Path('./predictions/tournament_{}/round_{}'.format(
                tournament_name, round_number)))
        df.to_csv(Path('./predictions/tournament_{}/round_{}/{}.csv'.format(
            tournament_name, round_number, self.name)),
                  index_label='id',
                  float_format='%.8f')
Example #5
    def on_new_training_data(self, round_number):
        tournament_name = self.numerauto.tournaments[self.tournament_id]

        train_x = pd.read_csv(self.numerauto.get_dataset_path(round_number) /
                              'numerai_training_data.csv',
                              header=0)
        target_columns = set([x for x in list(train_x) if x[0:7] == 'target_'])

        train_y = train_x['target_' + tournament_name].values
        train_x = train_x.drop({'id', 'era', 'data_type'} | target_columns,
                               axis=1).values

        logger.info(
            'SKLearnModelTrainer(%s): Fitting model for tournament %s round %d',
            self.name, tournament_name, round_number)
        model = self.model_factory()
        model.fit(train_x, train_y)

        ensure_directory_exists(
            self.numerauto.config['model_directory'] /
            'tournament_{}/round_{}'.format(tournament_name, round_number))
        model_filename = self.numerauto.config[
            'model_directory'] / 'tournament_{}/round_{}/{}.p'.format(
                tournament_name, round_number, self.name)
        pickle.dump(model, open(model_filename, 'wb'))

        self.numerauto.report['training'][tournament_name][
            self.name]['filename'] = model_filename
Example #6
    def _unzip_file(self, src_path, dest_path, filename):
        """unzips file located at src_path into destination_path"""
        self.logger.info("unzipping file...")

        # construct full path (including file name) for unzipping
        unzip_path = os.path.join(dest_path, filename)
        utils.ensure_directory_exists(unzip_path)

        # extract data
        with zipfile.ZipFile(src_path, "r") as z:
            z.extractall(unzip_path)

        return True
Example #7
    def download_latest_data(self, data_type: str, extension: str = "csv",
                             dest_path: str = ".", dest_filename: str = None):
        # set up download path
        if dest_filename is None:
            dest_filename = f"latest_numerai_{data_type}_data.{extension}"

        dataset_path = os.path.join(dest_path, dest_filename)

        # create parent folder if necessary
        utils.ensure_directory_exists(dest_path)

        url = self.get_latest_data_url(data_type, extension)
        utils.download_file(url, dataset_path, self.show_progress_bars)
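For reference, a call based on the signature above might look like the following, assuming the method belongs to the NumerAPI client shown in the other examples; the data_type value "training" and the dest_path "datasets" are illustrative assumptions, not values taken from the snippet:

napi = NumerAPI()
# with the defaults shown above this saves datasets/latest_numerai_training_data.csv
napi.download_latest_data("training", dest_path="datasets")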
Example #8
    def download_current_dataset(self, dest_path=".", dest_filename=None,
                                 unzip=True, tournament=8):
        """Download dataset for the current active round.

        Args:
            dest_path (str, optional): destination folder, defaults to `.`
            dest_filename (str, optional): desired filename of dataset file,
                defaults to `numerai_dataset_<round number>.zip`
            unzip (bool, optional): indication of whether the training data
                should be unzipped, defaults to `True`
            tournament (int, optional): ID of the tournament, defaults to 8
                -- DEPRECATED there is only one tournament nowadays

        Returns:
            str: Path to the downloaded dataset

        Example:
            >>> NumerAPI().download_current_dataset()
            ./numerai_dataset_104.zip
        """
        # set up download path
        if dest_filename is None:
            try:
                round_number = self.get_current_round(tournament)
            except ValueError:
                round_number = "x"
            dest_filename = f"numerai_dataset_{round_number}.zip"
        else:
            # ensure it ends with ".zip"
            if unzip and not dest_filename.endswith(".zip"):
                dest_filename += ".zip"
        dataset_path = os.path.join(dest_path, dest_filename)

        if os.path.exists(dataset_path):
            self.logger.info("target file already exists")
            return dataset_path

        # create parent folder if necessary
        utils.ensure_directory_exists(dest_path)

        url = self.get_dataset_url(tournament)
        utils.download_file(url, dataset_path, self.show_progress_bars)

        # unzip dataset
        if unzip:
            # remove the ".zip" at the end
            dataset_name = dest_filename[:-4]
            self._unzip_file(dataset_path, dest_path, dataset_name)

        return dataset_path
Example #9
    def download_validation_data(self, dest_path: str = ".",
                                 dest_filename: str = None) -> str:
        """download CSV file with historical targets and ticker universe

        Returns:
            str: path to csv file

        Example:
            >>> SignalsAPI().download_validation_data()
            ./numerai_signals_historical.csv
        """
        # set up download path
        if dest_filename is None:
            dest_filename = "numerai_signals_historical.csv"

        path = os.path.join(dest_path, dest_filename)

        # create parent folder if necessary
        utils.ensure_directory_exists(dest_path)
        utils.download_file(
            self.HISTORICAL_DATA_URL, path, self.show_progress_bars)
        return path
Example #10
    def on_new_tournament_data(self, round_number):
        tournament_name = self.numerauto.tournaments[self.tournament_id]

        test_x = pd.read_csv(self.numerauto.get_dataset_path(round_number) /
                             'numerai_tournament_data.csv',
                             header=0)
        target_columns = set([x for x in list(test_x) if x[0:7] == 'target_'])

        test_ids = test_x['id']
        test_x = test_x.drop({'id', 'era', 'data_type'} | target_columns,
                             axis=1).values

        logger.info(
            'SKLearnModelTrainer(%s): Applying model for tournament %s round %d',
            self.name, tournament_name, round_number)
        model_filename = self.numerauto.config[
            'model_directory'] / 'tournament_{}/round_{}/{}.p'.format(
                tournament_name,
                self.numerauto.persistent_state['last_round_trained'],
                self.name)
        model = pickle.load(open(model_filename, 'rb'))
        predictions = model.predict(test_x)

        df = pd.DataFrame(predictions,
                          columns=['prediction_' + tournament_name],
                          index=test_ids)
        ensure_directory_exists(
            self.numerauto.config['prediction_directory'] /
            'tournament_{}/round_{}'.format(tournament_name, round_number))
        df.to_csv(self.numerauto.config['prediction_directory'] /
                  'tournament_{}/round_{}/{}.csv'.format(
                      tournament_name, round_number, self.name),
                  index_label='id',
                  float_format='%.8f')

        self.numerauto.report['predictions'][tournament_name][
            self.name + '.csv']['filename'] = self.numerauto.config[
                'prediction_directory'] / 'tournament_{}/round_{}/{}.csv'.format(
                    tournament_name, round_number, self.name)
Example #11
    def on_cleanup(self, round_number):
        # Function to turn nested_defaultdict back into normal dictionaries
        to_dict = lambda x: ({y: to_dict(x[y]) for y in x}
                             if type(x) == collections.defaultdict else x)

        ensure_directory_exists(self.numerauto.config['report_directory'])
        filename = self.numerauto.config[
            'report_directory'] / 'round_{}.txt'.format(round_number)
        with open(filename, 'w') as f:
            # Recurse through dictionary structure and write to file
            def write_dict(d, indent=0):
                for x in d:
                    if type(d[x]) == dict:
                        f.write('  ' * indent + str(x) + ':\n')
                        write_dict(d[x], indent + 1)
                    else:
                        f.write('  ' * indent + str(x) + ': ' + str(d[x]) +
                                '\n')

            logger.debug('BasicReportWriter(%s): Writing report to file: %s',
                         self.name, filename)
            write_dict(to_dict(self.numerauto.report))
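The to_dict helper above converts a nested collections.defaultdict back into plain dictionaries; the snippets never show how that report structure is created. A minimal sketch of a recursive factory that would produce it (the name nested_defaultdict comes from the comment in the code, the implementation itself is an assumption):

import collections

def nested_defaultdict():
    # every missing key yields another nested defaultdict, so deep paths such as
    # report['training'][tournament_name][name]['filename'] can be assigned directly
    return collections.defaultdict(nested_defaultdict)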