Example 1
    def fit(self,
            x,
            y,
            validation_data=None,
            epochs=100,
            patience=0,
            verbose=None,
            min_delta=0,
            tensorboard=False,
            timeline=False,
            **keras_kwargs):

        # Fall back to the pipeline's encoded validation split when none is given.
        if validation_data is None:
            validation_data = self.model.pipeline.encoded_validation_data

        # Multi-input Keras models take a dict of named inputs, so convert
        # DataFrames for both the training and validation features.
        if isinstance(x, pandas.DataFrame):
            x = x.to_dict(orient='series')

        if isinstance(validation_data.x, pandas.DataFrame):
            validation_data = Observations(
                x=validation_data.x.to_dict(orient='series'),
                y=validation_data.y)

        # Build the network and optimizer lazily on first fit.
        if not self.keras or not self.optimizer:
            self.build()

        with self.session.as_default():
            # Collect step stats during training only when a Chrome trace
            # timeline is requested.
            if timeline:
                run_metadata = tensorflow.RunMetadata()
            else:
                run_metadata = None
            self.keras.compile(
                loss=self.loss,
                optimizer=self.optimizer,
                options=tensorflow.RunOptions(
                    trace_level=tensorflow.RunOptions.FULL_TRACE),
                run_metadata=run_metadata)

        # Default to chatty output in development, silent everywhere else.
        if verbose is None:
            verbose = 1 if lore.env.name == lore.env.DEVELOPMENT else 0

        logger.info('\n'.join([
            '\n\n\n  Fitting',
            '==============================',
            '| batch | learning |         |',
            '| size  | rate     |   decay |',
            '------------------------------',
            '| %5i | %8.6f | %7.5f |' % (
                self.batch_size,
                self.learning_rate,
                self.decay,
            ),
            '==============================\n\n',
        ]))

        # Checkpoint improvements and restore the best epoch's weights once
        # training finishes.
        reload_best = ReloadBest(
            filepath=self.model.checkpoint_path(),
            monitor=self.monitor,
            mode='auto',
        )

        # Abort on NaN loss, and stop early once the monitored metric fails
        # to improve by min_delta for `patience` epochs.
        callbacks = self.callbacks()
        callbacks += [
            reload_best,
            TerminateOnNaN(),
            EarlyStopping(
                monitor=self.monitor,
                min_delta=min_delta,
                patience=patience,
                verbose=verbose,
                mode='auto',
            ),
        ]
        if tensorboard:
            callbacks += [
                TensorBoard(log_dir=self.model.serializer.tensorboard_path,
                            histogram_freq=1,
                            batch_size=self.batch_size,
                            write_graph=True,
                            write_grads=True,
                            write_images=True,
                            embeddings_freq=1,
                            embeddings_metadata=None)
            ]

        with self.session.as_default():
            # Targets are replicated once per tower so every tower's output
            # is trained against the same labels.
            self.history = self.keras.fit(
                x=x,
                y=[y] * self.towers,
                validation_data=Observations(x=validation_data.x,
                                             y=[validation_data.y] *
                                             self.towers),
                batch_size=self.batch_size,
                epochs=epochs,
                verbose=verbose,
                callbacks=callbacks,
                **keras_kwargs).history

        if timeline:
            # Dump the collected step stats as a Chrome trace, viewable at
            # chrome://tracing.
            with open(self.model.timeline_path(), 'w') as f:
                f.write(
                    Timeline(step_stats=run_metadata.step_stats).
                    generate_chrome_trace_format())

        return {
            'epochs': len(self.history['loss']),
            'train': reload_best.train_loss,
            'validate': reload_best.validate_loss,
        }
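
A minimal call sketch for the fit method above. It assumes `estimator` is an instance of the defining class and that its pipeline exposes an `encoded_training_data` split alongside the `encoded_validation_data` the code falls back to; the hyperparameter values are illustrative:

    train = estimator.model.pipeline.encoded_training_data
    results = estimator.fit(
        x=train.x,
        y=train.y,
        epochs=50,         # upper bound on training epochs
        patience=5,        # stop after 5 epochs without improvement
        min_delta=1e-4,    # ignore improvements smaller than this
        tensorboard=True,  # also write TensorBoard logs
    )
    # Returns the epoch count plus the train/validate loss of the best epoch.
    print(results['epochs'], results['train'], results['validate'])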
Example 2
    def observations(self, data):
        # Encode a raw frame into an (x, y) pair of features and labels.
        return Observations(x=self.encode_x(data), y=self.encode_y(data))
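
A sketch of exercising this helper, assuming `pipeline` is an instance of the class above, `data` is a raw DataFrame containing the pipeline's source columns, and Observations unpacks as an (x, y) pair as the fit example above suggests (all names illustrative):

    observations = pipeline.observations(data)
    x, y = observations  # encoded features and labels, ready for an estimator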
Example 3
    def generator(self, table, orient='row', encoded=False, stratify=False, chunksize=None):
        # Lazily split the source data into its backing tables on first use.
        if not self.loaded:
            self._split_data()
        
        if orient == 'column':
            if encoded:
                for encoder in self.encoders:
                    transformed = encoder.transform(self.read_column(table, encoder.source_column))
                    # Accumulate this encoder's output in a dict of its own,
                    # kept separate from the `encoded` flag.
                    encoded_columns = {}
                    if hasattr(encoder, 'sequence_length'):
                        # Sequence encoders expand into one column per position.
                        for i in range(encoder.sequence_length):
                            encoded_columns[encoder.sequence_name(i)] = transformed[:, i]
                    else:
                        encoded_columns[encoder.name] = transformed
                    yield Observations(x=pandas.DataFrame(encoded_columns), y=None)

            else:
                for column in self.columns:
                    yield self.read_column(table, column)

        elif orient == 'row':
            if stratify:
                if not self.stratify:
                    raise ValueError("Can't stratify a generator for a pipeline with no stratify column")

                if chunksize is None:
                    chunksize = 1
                # Seed the window with the stratify-key range covered by the
                # first `chunksize` rows. `lower`/`upper` avoid shadowing the
                # min/max builtins.
                lower, upper = self.connection.execute(
                    """
                        SELECT min({stratify}), max({stratify})
                        FROM (
                            SELECT {stratify}
                            FROM {table}
                            ORDER BY {stratify} ASC
                            LIMIT :chunksize
                        )
                    """.format(
                        stratify=self.quote(self.stratify),
                        table=self.quote(table),
                    ),
                    {'chunksize': chunksize}
                ).fetchone()
                # Compare against None explicitly so a falsy stratify key such
                # as 0 does not end the iteration early.
                while lower is not None and upper is not None:
                    dataframe = pandas.read_sql(
                        """
                            SELECT * FROM {table} WHERE {stratify} BETWEEN :min AND :max
                        """.format(
                            stratify=self.quote(self.stratify),
                            table=self.quote(table),
                        ),
                        self.connection,
                        parse_dates=self.datetime_columns,
                        params={'min': lower, 'max': upper}
                    )
                    if encoded:
                        dataframe = Observations(x=self.encode_x(dataframe), y=self.encode_y(dataframe))
                    yield dataframe

                    # Slide the window forward to the next `chunksize` keys.
                    lower, upper = self.connection.execute(
                        """
                            SELECT min({stratify}), max({stratify})
                            FROM (
                                SELECT {stratify}
                                FROM {table}
                                WHERE {stratify} > :max
                                ORDER BY {stratify} ASC
                                LIMIT :chunksize
                            )
                        """.format(
                            stratify=self.quote(self.stratify),
                            table=self.quote(table),
                        ),
                        {'max': upper, 'chunksize': chunksize}
                    ).fetchone()
            
            else:
                if chunksize is None:
                    chunksize = self.chunksize
                # Stream the table in fixed-size chunks straight from the
                # database cursor.
                for dataframe in pandas.read_sql(
                        "SELECT * FROM {name}".format(name=self.quote(table)),
                        self.connection,
                        chunksize=chunksize,
                        parse_dates=self.datetime_columns
                ):
                    if encoded:
                        dataframe = Observations(x=self.encode_x(dataframe), y=self.encode_y(dataframe))
                    yield dataframe
        else:
            raise ValueError('orient "%s" is not one of: row, column' % orient)
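
A sketch of consuming the generator above, assuming `pipeline` is an instance of the defining class, 'train' is one of its backing table names, and read_column yields pandas Series objects (all names illustrative):

    # Stream encoded row-oriented chunks without loading the whole table.
    for batch in pipeline.generator('train', orient='row', encoded=True, chunksize=10000):
        print(len(batch.x))  # each batch is an Observations (x, y) pair

    # Or walk the raw columns one at a time.
    for column in pipeline.generator('train', orient='column'):
        print(column.name, len(column))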