Example #1
    def load(cls, fitting=None):
        model = super(Base, cls).load(fitting)
        
        if hasattr(model, 'estimator'):
            # HACK: self-assignment triggers the estimator property setter,
            # which re-wires the model and serializer after deserialization
            model.estimator = model.estimator

            # Rely on build + load_weights rather than loading the named layers
            # with Keras, both for efficiency and because named-layer loading
            # causes a deserialization issue as of Keras 2.0.4:
            # https://github.com/fchollet/keras/issues/5442
            model.estimator.build()

            try:
                with timer('load weights %i' % model.fitting):
                    model.estimator.keras.load_weights(model.weights_path())
            except ValueError as ex:
                if model.estimator.multi_gpu_model and not lore.estimators.keras.available_gpus:
                    error = "Trying to load a multi_gpu_model when no GPUs are present is not supported"
                    logger.exception(error)
                    raise NotImplementedError(error)
                else:
                    raise
                
        else:
            model.build()
            with timer('load weights'):
                model.keras.load_weights(model.weights_path())
        
        return model
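Every example on this page wraps work in a timer context manager imported from the lore codebase. For orientation, here is a minimal sketch of such a helper, assuming only that it logs the elapsed wall-clock time of its block under the given message at an optional logging level; it is an illustration, not lore's actual implementation:

import logging
import time
from contextlib import contextmanager

logger = logging.getLogger(__name__)

@contextmanager
def timer(message, level=logging.INFO):
    # time the wrapped block and log the duration, tagged with the caller's message
    start = time.time()
    try:
        yield
    finally:
        logger.log(level, '%s (%.3f seconds)', message, time.time() - start)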
Example #2
def download(remote_url, local_path=None, cache=True, extract=False):
    if re.match(r'^https?://', remote_url):
        protocol = 'http'
    else:
        protocol = 's3'
        remote_url = prefix_remote_root(remote_url)

    if cache:
        if local_path is None:
            if protocol == 'http':
                filename = urlparse(remote_url).path.split('/')[-1]
            elif protocol == 's3':
                filename = remote_url
            local_path = os.path.join(lore.env.data_dir, filename)
        
        if os.path.exists(local_path):
            return local_path
    elif local_path:
        raise ValueError("You can't pass lore.io.download(local_path=X), unless you also pass cache=True")
    elif extract:
        raise ValueError("You can't pass lore.io.download(extract=True), unless you also pass cache=True")

    with timer('DOWNLOAD: %s' % remote_url):
        temp_file, temp_path = tempfile.mkstemp()
        os.close(temp_file)  # only the path is used below; close the descriptor mkstemp leaves open
        try:
            if protocol == 'http':
                urlretrieve(remote_url, temp_path)
            else:
                bucket.download_file(remote_url, temp_path)
        except ClientError as e:
            logger.error("Error downloading file: %s" % e)
            raise
            
    if cache:
        directory = os.path.dirname(local_path)
        if not os.path.exists(directory):
            try:
                os.makedirs(directory)
            except FileExistsError:
                pass  # race to create
    
        os.rename(temp_path, local_path)

        if extract:
            with timer('EXTRACT: %s' % local_path, logging.WARNING):
                if local_path.endswith('.tar.gz'):
                    with tarfile.open(local_path, 'r:gz') as tar:
                        tar.extractall(os.path.dirname(local_path))
                elif local_path.endswith('.zip'):
                    import zipfile
                    with zipfile.ZipFile(local_path, 'r') as archive:
                        archive.extractall(os.path.dirname(local_path))

    else:
        local_path = temp_path
    return local_path
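A hypothetical call, assuming the module-level names from the snippet above (the URL is invented for illustration):

path = download('https://example.com/dataset.tar.gz', cache=True, extract=True)
# downloads once into lore.env.data_dir, returns the cached path, and unpacks the tarball alongside it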
Example #3
    def __setitem__(self, key, value):
        with timer('write %s' % key):
            with open(self._path(key), 'wb') as f:
                pickle.dump(value, f, pickle.HIGHEST_PROTOCOL)

        if self.limit is not None:
            if os.path.getsize(self._path(key)) > self.limit:
                # a single entry larger than the entire cache can never fit
                raise MemoryError(
                    'disk cache limit exceeded by single key: %s' % key)

            # evict least-recently-used entries until the cache fits its limit
            with timer('evict %s' % key):
                while self.size() > self.limit:
                    del self[self.lru()]
Example #4
    def reverse_transform(self, array):
        with timer('reverse_transform %s' % self.name, logging.DEBUG):
            data = pandas.DataFrame(array)
            for column in data:
                data[column] = super(Token, self).reverse_transform(data[column])
            return data.T.apply(' '.join)
Example #5
    def load_dataframe(self, key, columns, log_verbose=False):
        with timer('load_dataframe'):
            frames = []
            for entry in lore.io.bucket.objects.filter(
                Prefix=os.path.join(self.UNLOAD_PREFIX, key)
            ):
                temp = tempfile.NamedTemporaryFile()
                lore.io.bucket.download_file(entry.key, temp.name)
                dataframe = pandas.read_csv(
                    temp.name,
                    delimiter='|',
                    quotechar='"',
                    compression='gzip',
                    error_bad_lines=False
                )
                dataframe.columns = columns
                frames.append(dataframe)

            result = pandas.concat(frames)
            result.columns = columns
            if log_verbose:
                buffer = io.StringIO()
                result.info(buf=buffer, memory_usage='deep')
                logger.info(buffer.getvalue())
                logger.info(result.head())
            return result
Example #6
    def load(self, fitting=None):
        if fitting:
            self.fitting = fitting

        with timer('unpickle model'):
            with open(self.model_path, 'rb') as f:
                self.model = pickle.load(f)
Example #7
    def fit(self, data):
        with timer('fit %s' % self.name, logging.DEBUG):
            if self.correlation:
                if not isinstance(self.column, str):
                    raise ValueError("Cannot correlate with non-native columns")

                self.map = MissingValueMap(
                    data.groupby(
                        self.column)[self.correlation].mean().to_dict())
                self.missing_value = data[self.column].mean()
            else:
                if self.stratify:
                    ids = pandas.DataFrame({
                        'id': self.series(data),
                        'stratify': data[self.stratify]
                    }).drop_duplicates()
                else:
                    ids = pandas.DataFrame({'id': self.series(data)})
                counts = pandas.DataFrame({'n': ids.groupby('id').size()})
                qualified = counts[counts.n >= self.minimum_occurrences].copy()
                qualified['encoded_id'] = numpy.arange(len(qualified)) + 2

                self.map = MissingValueMap(qualified.to_dict()['encoded_id'])
                self.missing_value = len(self.map) + 2

            self.inverse = {v: k for k, v in self.map.items()}
            self.inverse[self.tail_value] = 'LONG_TAIL'
            self.dtype = self._type_from_cardinality()
Example #8
    def reverse_transform(self, array):
        with timer('reverse_transform %s' % self.name, logging.DEBUG):
            data = pandas.DataFrame(array)
            for column in data:
                data[column] = super(NestedUnique, self).reverse_transform(data[column])
            return numpy.array(data)
Example #9
    def unnest(self, data, fit=False):
        """
        :param data: a dataframe containing a column to be unnested
        :param fit: if True, self.sequence_length will exactly accommodate the largest sequence length
        :return: single-column DataFrame of values with length = rows * sequence_length
        """
        with timer('unnest %s' % self.name, logging.DEBUG):
            raw = self.series(data)
            # lengths of every sequence; None/NaN entries count as empty
            lengths = [0 if x is None or (isinstance(x, float) and numpy.isnan(x)) else len(x) for x in raw.values]
            if fit and self.sequence_length is None:
                self.sequence_length = numpy.max(lengths)

            # pad (or truncate) every sequence to exactly sequence_length, filling with NaN
            def fill_x(x, length):
                x_new = numpy.empty(length, dtype='float')
                x_new[:] = numpy.nan
                if x is None or (isinstance(x, float) and numpy.isnan(x)):
                    return x_new
                fill_length = min(len(x), length)
                x_new[0:fill_length] = x[0:fill_length]
                return x_new

            same_size = [fill_x(x, self.sequence_length) for x in raw.values]
            # flatten the padded sequences into a single long column
            flattened = [item for sublist in same_size for item in sublist]
            return pandas.DataFrame({self.column: flattened})
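To make the padding contract concrete, a hypothetical trace (values invented for illustration): given raw values [[1.0, 2.0], [3.0]] and sequence_length = 2, fill_x leaves the first sequence as [1.0, 2.0] and pads the second to [3.0, nan]; flattening then yields the single column [1.0, 2.0, 3.0, nan].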
Example #10
    def fit(self, data):
        with timer('fit %s' % self.name, logging.DEBUG):
            series = self.series(data).astype(self.dtype)
            self.__min = float(series.min())
            self.__max = float(series.max())
            self.__mean = numpy.mean(series)
            self.__std = numpy.std(series)
Example #11
    def fit(self, data):
        with timer('fit %s' % self.name, logging.DEBUG):
            if self.stratify:
                ids = pandas.DataFrame({
                    'id': self.series(data),
                    'stratify': data[self.stratify],
                }).drop_duplicates()
            else:
                ids = pandas.DataFrame({'id': self.series(data)})
                if self.correlation:
                    ids['correlation'] = data[self.correlation]

            counts = pandas.DataFrame({'n': ids.groupby('id').size()})
            if self.correlation:
                counts['correlation'] = ids.groupby('id')['correlation'].mean()
                counts = counts.sort_values('correlation')

            qualified = counts[counts.n >= self.minimum_occurrences].copy()
            qualified['encoded_id'] = numpy.arange(len(qualified)) + 2

            self.map = MissingValueMap(qualified.to_dict()['encoded_id'])
            self.missing_value = len(self.map) + 2

            self.inverse = {v: k for k, v in self.map.items()}
            self.inverse[self.tail_value] = 'LONG_TAIL'
            self.dtype = self._type_from_cardinality()
Example #12
    def encoded_validation_data(self):
        if not self._encoded_validation_data:
            with timer('encode validation data'):
                self._encoded_validation_data = self.observations(
                    self.validation_data)

        return self._encoded_validation_data
Example #13
    def transform(self, data):
        with timer('transform %s' % self.name, logging.DEBUG):
            series = self.series(data).astype(numpy.float16)
            null = series.isnull()
            # binarize: any non-zero value becomes 1, then nulls get the missing sentinel
            series[series != 0] = 1
            series[null] = self.missing_value
            return series.astype(numpy.uint8).values
Example #14
    def output_encoder(self):
        if self._output_encoder is None:
            with timer('fit output encoder'):
                self._output_encoder = self.get_output_encoder()
                self._output_encoder.fit(self.training_data)

        return self._output_encoder
Example #15
    def output_encoder(self):
        if self._output_encoder is None:
            with timer('fit output encoder'):
                self._output_encoder = self.get_output_encoder()
                self._output_encoder.fit(self.read_column(self.table_training, self._output_encoder.source_column))

        return self._output_encoder
Example #16
    def save(self, stats=None):
        if self.fitting is None:
            raise ValueError(
                "This model has not been fit yet. There is no point in saving."
            )

        if not os.path.exists(self.fitting_path()):
            try:
                os.makedirs(self.fitting_path())
            except FileExistsError as ex:
                pass  # race to create

        with timer('pickle model'):
            with open(self.model_path(), 'wb') as f:
                pickle.dump(self, f)

        with open(join(self.fitting_path(), 'params.json'), 'w') as f:
            params = {}
            for child in [self.estimator, self.pipeline]:
                param = child.__module__ + '.' + child.__class__.__name__
                params[param] = {}
                if hasattr(child, '__getstate__'):
                    state = child.__getstate__()
                else:
                    state = child.__dict__
                for key, value in state.items():
                    if not key.startswith('_'):
                        params[param][key] = repr(value)
            json.dump(params, f, indent=2, sort_keys=True)

        if stats:
            with open(join(self.fitting_path(), 'stats.json'), 'w') as f:
                json.dump(stats, f, indent=2, sort_keys=True)
Example #17
    def transform(self, data):
        with timer('transform %s' % self.name, logging.DEBUG):
            series = self.series(data)
            null = series.isnull()
            # log1p of the non-negative values; nulls are restored afterwards
            series = numpy.log1p(numpy.maximum(series.fillna(0), 0))
            series[null] = float('nan')
            return series
Example #18
    def transform(self, data):
        with timer('transform %s' % self.name, logging.DEBUG):
            series = self.series(data)
            other = self.other_series(data)
            if other is None:
                other = datetime.datetime.now()
            elif other.dtype != 'datetime64[ns]':
                logger.warning('%s is not a datetime. Converting to datetime64[ns]' % self.column)
                other = pandas.to_datetime(other).astype('datetime64[ns]')

            if series.dtype != 'datetime64[ns]':
                logger.warning('%s is not a datetime. Converting to datetime64[ns]' % self.column)
                series = pandas.to_datetime(series).astype('datetime64[ns]')

            age = (other - series)
            if self.unit in ['nanosecond', 'nanoseconds']:
                return age

            seconds = age.dt.total_seconds()
            if self.unit in ['second', 'seconds']:
                return seconds
            if self.unit in ['minute', 'minutes']:
                return seconds / 60
            if self.unit in ['hour', 'hours']:
                return seconds / 3600
            if self.unit in ['day', 'days']:
                return seconds / 86400
            if self.unit in ['week', 'weeks']:
                return seconds / 604800
            if self.unit in ['month', 'months']:
                return seconds / 2592000  # 30-day month
            if self.unit in ['year', 'years']:
                return seconds / 31536000  # 365-day year

            raise NameError('Unknown unit: %s' % self.unit)
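For example, with self.unit set to 'days', a value exactly one week older than other transforms to 7.0 (604800 seconds / 86400).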
Example #19
    def transform(self, data):
        with timer('transform %s' % self.name, logging.DEBUG):
            series = self.series(data)
            result = pandas.Series(series, copy=True)
            # out-of-range values map to the unfit sentinel, nulls to the missing sentinel
            result[(series > self.__max) | (series < 0)] = self.unfit_value
            result[series.isnull()] = self.missing_value
            return result.astype(self.dtype).values
Example #20
    def encoded_training_data(self):
        if not self._encoded_training_data:
            with timer('encode training data'):
                self._encoded_training_data = self.observations(
                    self.training_data)

        return self._encoded_training_data
Example #21
    def transform(self, data):
        with timer('transform one_hot %s:' % self.name, logging.DEBUG):
            dummies = self.get_dummies(data)
            # add any dummy column seen during fit that is absent from this batch
            for col in [c for c in self.dummy_columns if c not in dummies.columns]:
                dummies[col] = 0
            return dummies[self.dummy_columns]
Example #22
    def on_train_end(self, logs=None):
        super(ReloadBest, self).on_train_end(logs)
        logger.info('=============================================')
        if self.best_epoch is not None:
            logger.debug('best epoch: %i' % self.best_epoch)
            with timer('load best epoch'):
                self.model.load_weights(
                    self.filepath.format(epoch=self.best_epoch))
Example #23
    def _dataframe(self, sql, bindings):
        with timer("dataframe:"):
            if self._connection is None:
                self._connection = self._engine.connect()
            dataframe = pandas.read_sql(sql=sql,
                                        con=self._connection,
                                        params=bindings)
            return dataframe
Example #24
    def _dataframe(self, sql, bindings, chunksize=None):
        with timer("dataframe:"):
            if self._connection is None:
                self._connection = self._engine.connect()
            return pandas.read_sql(sql=sql,
                                   con=self._connection,
                                   params=bindings,
                                   chunksize=chunksize)
Example #25
    def encoders(self):
        if self._encoders is None:
            with timer('fit encoders'):
                self._encoders = self.get_encoders()
                for encoder in self._encoders:
                    encoder.fit(self.read_column(self.table_training, encoder.source_column))

        return self._encoders
Example #26
    def transform(self, data):
        """
        :param data: DataFrame with column to encode
        :return: encoded values reshaped to (rows, sequence_length)
        """
        with timer('transform %s' % self.name, logging.DEBUG):
            transformed = super(NestedNorm, self).transform(self.unnest(data))
            return transformed.reshape((len(data), self.sequence_length))
Example #27
    def transform(self, data):
        with timer('transform %s' % self.name, logging.DEBUG):
            series = self.series(data)
            cut = pandas.cut(series, bins=self.bins, labels=False, include_lowest=True)
            # values below/above the fitted range and nulls get dedicated sentinel bins
            cut[series < self.lower_bound] = self.quantiles
            cut[series > self.upper_bound] = self.quantiles + 1
            cut[series.isnull()] = self.missing_value
            return cut.astype(self.dtype).values
Example #28
    def replace(self, table, dataframe, batch_size=10**5):
        import migrate.changeset
        global _after_replace_callbacks

        with timer('REPLACE ' + table):
            suffix = datetime.now().strftime('_%Y%m%d%H%M%S').encode('utf-8')
            self.metadata  # bare access; presumably triggers lazy metadata reflection before autoload
            temp = 'tmp_'.encode('utf-8')
            source = sqlalchemy.Table(table,
                                      self.metadata,
                                      autoload=True,
                                      autoload_with=self._engine)
            destination_name = 'tmp_' + hashlib.sha256(
                temp + table.encode('utf-8') + suffix).hexdigest()[0:56]
            destination = sqlalchemy.Table(destination_name,
                                           self.metadata,
                                           autoload=False)
            for column in source.columns:
                destination.append_column(column.copy())
            destination.create()

            original_names = {}
            for index in source.indexes:
                # make sure the name is < 63 chars with the suffix
                name = hashlib.sha256(temp + index.name.encode('utf-8') +
                                      suffix).hexdigest()[0:60]
                original_names[name] = index.name
                columns = []
                for column in index.columns:
                    columns.append(
                        next(x for x in destination.columns
                             if x.name == column.name))
                new = sqlalchemy.Index(name, *columns)
                new.unique = index.unique
                new.table = destination
                new.create(bind=self._connection)
            self.insert(destination.name, dataframe, batch_size=batch_size)
            self.execute(
                sql="BEGIN; SET LOCAL statement_timeout = '1min'; ANALYZE %s; COMMIT;"
                    % self.quote_identifier(table))

            with self as transaction:
                backup = sqlalchemy.Table(table + '_b', self.metadata)
                backup.drop(bind=self._connection, checkfirst=True)
                source.rename(name=source.name + '_b',
                              connection=self._connection)
                destination.rename(name=table, connection=self._connection)
                for index in source.indexes:
                    index.rename(index.name[0:-2] + '_b',
                                 connection=self._connection)
                for index in destination.indexes:
                    index.rename(original_names[index.name],
                                 connection=self._connection)

        for func in _after_replace_callbacks:
            func(destination, source)
Example #29
    def fit(self, data):
        with timer('fit %s' % self.name, logging.DEBUG):
            series = self.series(data)
            self.__min = series.min()
            self.__range = series.max() - self.__min
            if isinstance(self.__range, timedelta):
                logger.warning('Discrete timedelta requires (slower) 64bit float math. '
                               'Could you use the epoch instead for %s?' % self.name)
                self.__range = self.__range.total_seconds() * 1000000000  # convert to nanoseconds
Example #30
    def fit(self, data):
        with timer('fit %s' % self.name, logging.DEBUG):
            series = self.series(data)
            series_cut, self.bins = pandas.qcut(series, self.quantiles, retbins=True, labels=False, duplicates='drop')
            self.quantiles = len(self.bins) - 1
            self.missing_value = self.quantiles + 2
            self.lower_bound = series.min()
            self.upper_bound = series.max()
            self.dtype = self._type_from_cardinality()
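Note on the qcut call above: with duplicates='drop', pandas may merge equal bin edges, so the snippet recomputes self.quantiles from the bins actually returned rather than trusting the requested count.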