def load(cls, fitting=None):
    model = super(Base, cls).load(fitting)

    if hasattr(model, 'estimator'):
        # HACK to set estimator model, and model serializer
        model.estimator = model.estimator

        # Rely on build + load_weights rather than loading the named layers
        # with Keras for efficiency (and also because it causes a
        # deserialization issue) as of Keras 2.0.4:
        # https://github.com/fchollet/keras/issues/5442
        model.estimator.build()
        try:
            with timer('load weights %i' % model.fitting):
                model.estimator.keras.load_weights(model.weights_path())
        except ValueError:
            if model.estimator.multi_gpu_model and not lore.estimators.keras.available_gpus:
                error = "Trying to load a multi_gpu_model when no GPUs are present is not supported"
                logger.exception(error)
                raise NotImplementedError(error)
            else:
                raise
    else:
        model.build()
        with timer('load weights'):
            model.keras.load_weights(model.weights_path())

    return model
def download(remote_url, local_path=None, cache=True, extract=False):
    if re.match(r'^https?://', remote_url):
        protocol = 'http'
    else:
        protocol = 's3'
        remote_url = prefix_remote_root(remote_url)

    if cache:
        if local_path is None:
            if protocol == 'http':
                filename = urlparse(remote_url).path.split('/')[-1]
            elif protocol == 's3':
                filename = remote_url
            local_path = os.path.join(lore.env.data_dir, filename)

        if os.path.exists(local_path):
            return local_path
    elif local_path:
        raise ValueError("You can't pass lore.io.download(local_path=X) unless you also pass cache=True")
    elif extract:
        raise ValueError("You can't pass lore.io.download(extract=True) unless you also pass cache=True")

    with timer('DOWNLOAD: %s' % remote_url):
        temp_file, temp_path = tempfile.mkstemp()
        try:
            if protocol == 'http':
                urlretrieve(remote_url, temp_path)
            else:
                bucket.download_file(remote_url, temp_path)
        except ClientError as e:
            logger.error("Error downloading file: %s" % e)
            raise

    if cache:
        dir = os.path.dirname(local_path)
        if not os.path.exists(dir):
            try:
                os.makedirs(dir)
            except FileExistsError:
                pass  # race to create
        os.rename(temp_path, local_path)

        if extract:
            with timer('EXTRACT: %s' % local_path, logging.WARNING):
                if local_path[-7:] == '.tar.gz':
                    with tarfile.open(local_path, 'r:gz') as tar:
                        tar.extractall(os.path.dirname(local_path))
                elif local_path[-4:] == '.zip':
                    import zipfile
                    with zipfile.ZipFile(local_path, 'r') as zip:
                        zip.extractall(os.path.dirname(local_path))
    else:
        local_path = temp_path

    return local_path
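# A minimal usage sketch for download() above; the URL and s3 key here are
# hypothetical, only the branch behavior comes from the function:
archive = download('https://example.com/embeddings.tar.gz', extract=True)
# cached at os.path.join(lore.env.data_dir, 'embeddings.tar.gz') and
# extracted into the same directory

raw = download('datasets/train.csv.gz', cache=False)
# no http(s) scheme, so the key goes through prefix_remote_root() and
# bucket.download_file(); the bare tempfile path is returned, nothing cached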
def __setitem__(self, key, value):
    with timer('write %s' % key):
        with open(self._path(key), 'wb') as f:
            pickle.dump(value, f, pickle.HIGHEST_PROTOCOL)

    if self.limit is not None:
        if os.path.getsize(self._path(key)) > self.limit:
            raise MemoryError('disk cache limit exceeded by single key: %s' % key)

        with timer('evict %s' % key):
            while self.size() > self.limit:
                del self[self.lru()]
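# A hypothetical usage of the __setitem__ above; the DiskCache name and
# constructor are illustrative, only the eviction semantics come from the code:
cache = DiskCache(limit=1024 ** 3)  # 1 GiB cap (hypothetical constructor)
cache['features'] = dataframe       # pickled to disk at self._path('features')
# a single value larger than the limit raises MemoryError; otherwise keys
# returned by lru() are deleted until size() fits under the limit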
def reverse_transform(self, array):
    with timer('reverse_transform %s' % self.name, logging.DEBUG):
        data = pandas.DataFrame(array)
        for column in data:
            data[column] = super(Token, self).reverse_transform(data[column])
        return data.T.apply(' '.join)
def load_dataframe(self, key, columns, log_verbose=False):
    with timer('load_dataframe'):
        frames = []
        for entry in lore.io.bucket.objects.filter(
            Prefix=os.path.join(self.UNLOAD_PREFIX, key)
        ):
            temp = tempfile.NamedTemporaryFile()
            lore.io.bucket.download_file(entry.key, temp.name)
            dataframe = pandas.read_csv(
                temp.name,
                delimiter='|',
                quotechar='"',
                compression='gzip',
                error_bad_lines=False
            )
            dataframe.columns = columns
            frames.append(dataframe)

        result = pandas.concat(frames)
        result.columns = columns

        if log_verbose is True:
            buffer = io.StringIO()
            result.info(buf=buffer, memory_usage='deep')
            logger.info(buffer.getvalue())
            logger.info(result.head())

        return result
def load(self, fitting=None):
    if fitting:
        self.fitting = fitting

    with timer('unpickle model'):
        with open(self.model_path, 'rb') as f:
            self.model = pickle.load(f)
def fit(self, data):
    with timer('fit %s' % self.name, logging.DEBUG):
        if self.correlation:
            if not isinstance(self.column, str):
                raise TypeError('Can not correlate with non-native columns')
            self.map = MissingValueMap(
                data.groupby(self.column)[self.correlation].mean().to_dict())
            self.missing_value = data[self.column].mean()
        else:
            if self.stratify:
                ids = pandas.DataFrame({
                    'id': self.series(data),
                    'stratify': data[self.stratify]
                }).drop_duplicates()
            else:
                ids = pandas.DataFrame({'id': self.series(data)})
            counts = pandas.DataFrame({'n': ids.groupby('id').size()})
            qualified = counts[counts.n >= self.minimum_occurrences].copy()
            qualified['encoded_id'] = numpy.arange(len(qualified)) + 2
            self.map = MissingValueMap(qualified.to_dict()['encoded_id'])
            self.missing_value = len(self.map) + 2

        self.inverse = {v: k for k, v in self.map.items()}
        self.inverse[self.tail_value] = 'LONG_TAIL'
        self.dtype = self._type_from_cardinality()
def reverse_transform(self, array):
    with timer('reverse_transform %s' % self.name, logging.DEBUG):
        data = pandas.DataFrame(array)
        for column in data:
            data[column] = super(NestedUnique, self).reverse_transform(data[column])
        return numpy.array(data)
def unnest(self, data, fit=False):
    """
    :param data: a dataframe containing a column to be unnested
    :param fit: if True, self.sequence_length will exactly accommodate the
        largest sequence length
    :return: 1D array of values with length = rows * sequence_length
    """
    with timer('unnest %s' % self.name, logging.DEBUG):
        raw = self.series(data)

        # lengths of every sequence
        lengths = [0 if x is None or (isinstance(x, float) and numpy.isnan(x)) else len(x)
                   for x in raw.values]
        if fit and self.sequence_length is None:
            self.sequence_length = numpy.max(lengths)

        # Make them all the same size
        def fill_x(x, length):
            x_new = numpy.empty(length, dtype='float')
            x_new[:] = numpy.nan
            if x is None or (isinstance(x, float) and numpy.isnan(x)):
                return x_new
            fill_length = min(len(x), length)
            x_new[0:fill_length] = x[0:fill_length]
            return x_new

        same_size = [fill_x(x, self.sequence_length) for x in raw.values]

        # Flatten
        flattened = [item for sublist in same_size for item in sublist]
        return pandas.DataFrame({self.column: flattened})
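# A standalone trace of the padding rule in unnest() above, using hypothetical
# values and sequence_length = 3; mirrors fill_x without the encoder instance:
import numpy

def _fill(x, length):
    padded = numpy.full(length, numpy.nan)  # pad short rows with NaN
    if x is not None:
        n = min(len(x), length)             # truncate anything longer
        padded[:n] = x[:n]
    return padded

print([_fill(x, 3) for x in ([1, 2], [3, 4, 5, 6], None)])
# [array([ 1.,  2., nan]), array([3., 4., 5.]), array([nan, nan, nan])]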
def fit(self, data):
    with timer('fit %s' % self.name, logging.DEBUG):
        series = self.series(data).astype(self.dtype)
        self.__min = float(series.min())
        self.__max = float(series.max())
        self.__mean = numpy.mean(series)
        self.__std = numpy.std(series)
def fit(self, data):
    with timer('fit %s' % self.name, logging.DEBUG):
        if self.stratify:
            ids = pandas.DataFrame({
                'id': self.series(data),
                'stratify': data[self.stratify],
            }).drop_duplicates()
        else:
            ids = pandas.DataFrame({'id': self.series(data)})

        if self.correlation:
            ids['correlation'] = data[self.correlation]

        counts = pandas.DataFrame({'n': ids.groupby('id').size()})
        if self.correlation:
            counts['correlation'] = ids.groupby('id')['correlation'].mean()
            counts = counts.sort_values('correlation')

        qualified = counts[counts.n >= self.minimum_occurrences].copy()
        qualified['encoded_id'] = numpy.arange(len(qualified)) + 2
        self.map = MissingValueMap(qualified.to_dict()['encoded_id'])
        self.missing_value = len(self.map) + 2
        self.inverse = {v: k for k, v in self.map.items()}
        self.inverse[self.tail_value] = 'LONG_TAIL'
        self.dtype = self._type_from_cardinality()
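# An illustrative trace of the fit() above with hypothetical counts and
# minimum_occurrences = 2; encoded ids start at 2, leaving room below for
# special markers like the long tail:
#
#   counts.n: {'a': 5, 'b': 1, 'c': 3}
#   -> qualified: 'a' and 'c'; map = {'a': 2, 'c': 3}
#   -> missing_value = len(map) + 2 = 4
#   -> inverse = {2: 'a', 3: 'c', tail_value: 'LONG_TAIL'}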
def encoded_validation_data(self):
    if not self._encoded_validation_data:
        with timer('encode validation data'):
            self._encoded_validation_data = self.observations(self.validation_data)
    return self._encoded_validation_data
def transform(self, data):
    with timer('transform %s' % self.name, logging.DEBUG):
        series = self.series(data).astype(numpy.float16)
        null = series.isnull()
        series[series != 0] = 1  # any non-zero value encodes as 1
        series[null] = self.missing_value
        return series.astype(numpy.uint8).values
def output_encoder(self):
    if self._output_encoder is None:
        with timer('fit output encoder'):
            self._output_encoder = self.get_output_encoder()
            self._output_encoder.fit(self.training_data)
    return self._output_encoder
def output_encoder(self):
    if self._output_encoder is None:
        with timer('fit output encoder'):
            self._output_encoder = self.get_output_encoder()
            self._output_encoder.fit(
                self.read_column(self.table_training, self._output_encoder.source_column))
    return self._output_encoder
def save(self, stats=None):
    if self.fitting is None:
        raise ValueError(
            "This model has not been fit yet. There is no point in saving."
        )

    if not os.path.exists(self.fitting_path()):
        try:
            os.makedirs(self.fitting_path())
        except FileExistsError:
            pass  # race to create

    with timer('pickle model'):
        with open(self.model_path(), 'wb') as f:
            pickle.dump(self, f)

    with open(join(self.fitting_path(), 'params.json'), 'w') as f:
        params = {}
        for child in [self.estimator, self.pipeline]:
            param = child.__module__ + '.' + child.__class__.__name__
            params[param] = {}
            if hasattr(child, '__getstate__'):
                state = child.__getstate__()
            else:
                state = child.__dict__
            for key, value in state.items():
                if not key.startswith('_'):
                    params[param][key] = value.__repr__()
        json.dump(params, f, indent=2, sort_keys=True)

    if stats:
        with open(join(self.fitting_path(), 'stats.json'), 'w') as f:
            json.dump(stats, f, indent=2, sort_keys=True)
def transform(self, data):
    with timer('transform %s' % self.name, logging.DEBUG):
        series = self.series(data)
        null = series.isnull()
        # clamp negatives to 0 before log1p, then restore nulls as NaN
        series = numpy.log1p(numpy.maximum(series.fillna(0), 0))
        series[null] = float('nan')
        return series
def transform(self, data):
    with timer('transform %s' % self.name, logging.DEBUG):
        series = self.series(data)
        other = self.other_series(data)
        if other is None:
            other = datetime.datetime.now()
        elif other.dtype != 'datetime64[ns]':
            logger.warning('%s is not a datetime. Converting to datetime64[ns]' % self.column)
            other = pandas.to_datetime(other).astype('datetime64[ns]')
        if series.dtype != 'datetime64[ns]':
            logger.warning('%s is not a datetime. Converting to datetime64[ns]' % self.column)
            series = pandas.to_datetime(series).astype('datetime64[ns]')

        age = other - series
        if self.unit in ['nanosecond', 'nanoseconds']:
            return age

        seconds = age.dt.total_seconds()
        if self.unit in ['second', 'seconds']:
            return seconds
        if self.unit in ['minute', 'minutes']:
            return seconds / 60
        if self.unit in ['hour', 'hours']:
            return seconds / 3600
        if self.unit in ['day', 'days']:
            return seconds / 86400
        if self.unit in ['week', 'weeks']:
            return seconds / 604800
        if self.unit in ['month', 'months']:
            return seconds / 2592000   # a month is approximated as 30 days
        if self.unit in ['year', 'years']:
            return seconds / 31536000  # a year is approximated as 365 days
        raise NameError('Unknown unit: %s' % self.unit)
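# A worked example of the unit arithmetic above with a hypothetical age of
# 2.5 days, i.e. 216000 seconds:
#
#   unit='hours' -> 216000 / 3600   = 60.0
#   unit='days'  -> 216000 / 86400  = 2.5
#   unit='weeks' -> 216000 / 604800 ~= 0.357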
def transform(self, data):
    with timer('transform %s' % self.name, logging.DEBUG):
        series = self.series(data)
        result = pandas.Series(series, copy=True)
        result[(series > self.__max) | (series < 0)] = self.unfit_value
        result[series.isnull()] = self.missing_value
        return result.astype(self.dtype).values
def encoded_training_data(self):
    if not self._encoded_training_data:
        with timer('encode training data'):
            self._encoded_training_data = self.observations(self.training_data)
    return self._encoded_training_data
def transform(self, data):
    with timer('transform one_hot %s' % self.name, logging.DEBUG):
        dummies = self.get_dummies(data)
        # add any dummy columns seen at fit time that are absent from this batch
        for col in [c for c in self.dummy_columns if c not in dummies.columns]:
            dummies[col] = 0
        return dummies[self.dummy_columns]
def on_train_end(self, logs=None):
    super(ReloadBest, self).on_train_end(logs)
    logger.info('=============================================')
    if self.best_epoch is not None:
        logger.debug('best epoch: %i' % self.best_epoch)
        with timer('load best epoch'):
            self.model.load_weights(self.filepath.format(epoch=self.best_epoch))
def _dataframe(self, sql, bindings):
    with timer('dataframe'):
        if self._connection is None:
            self._connection = self._engine.connect()
        dataframe = pandas.read_sql(sql=sql, con=self._connection, params=bindings)
    return dataframe
def _dataframe(self, sql, bindings, chunksize=None):
    with timer('dataframe'):
        if self._connection is None:
            self._connection = self._engine.connect()
        return pandas.read_sql(sql=sql, con=self._connection,
                               params=bindings, chunksize=chunksize)
def encoders(self):
    if self._encoders is None:
        with timer('fit encoders'):
            self._encoders = self.get_encoders()
            for encoder in self._encoders:
                encoder.fit(self.read_column(self.table_training, encoder.source_column))
    return self._encoders
def transform(self, data):
    """
    :param data: DataFrame with column to encode
    :return: encoded values reshaped to (rows, sequence_length)
    """
    with timer('transform %s' % self.name, logging.DEBUG):
        transformed = super(NestedNorm, self).transform(self.unnest(data))
        return transformed.reshape((len(data), self.sequence_length))
def transform(self, data):
    with timer('transform %s' % self.name, logging.DEBUG):
        series = self.series(data)
        cut = pandas.cut(series, bins=self.bins, labels=False, include_lowest=True)
        # out-of-range and null values get dedicated codes past the quantile bins
        cut[series < self.lower_bound] = self.quantiles
        cut[series > self.upper_bound] = self.quantiles + 1
        cut[series.isnull()] = self.missing_value
        return cut.astype(self.dtype).values
def replace(self, table, dataframe, batch_size=10**5):
    import migrate.changeset
    global _after_replace_callbacks

    with timer('REPLACE ' + table):
        suffix = datetime.now().strftime('_%Y%m%d%H%M%S').encode('utf-8')
        self.metadata  # ensure the (lazily built) metadata is initialized before reflection
        temp = 'tmp_'.encode('utf-8')
        source = sqlalchemy.Table(table, self.metadata, autoload=True,
                                  autoload_with=self._engine)
        destination_name = 'tmp_' + hashlib.sha256(
            temp + table.encode('utf-8') + suffix).hexdigest()[0:56]
        destination = sqlalchemy.Table(destination_name, self.metadata, autoload=False)
        for column in source.columns:
            destination.append_column(column.copy())
        destination.create()

        original_names = {}
        for index in source.indexes:
            # make sure the name is < 63 chars with the suffix
            name = hashlib.sha256(temp + index.name.encode('utf-8') + suffix).hexdigest()[0:60]
            original_names[name] = index.name
            columns = []
            for column in index.columns:
                columns.append(next(x for x in destination.columns if x.name == column.name))
            new = sqlalchemy.Index(name, *columns)
            new.unique = index.unique
            new.table = destination
            new.create(bind=self._connection)

        self.insert(destination.name, dataframe, batch_size=batch_size)
        self.execute(sql="BEGIN; SET LOCAL statement_timeout = '1min'; ANALYZE %s; COMMIT;" %
                     self.quote_identifier(table))

        # swap the freshly loaded table in for the original inside one
        # transaction, keeping the old data around as '<table>_b'
        with self as transaction:
            backup = sqlalchemy.Table(table + '_b', self.metadata)
            backup.drop(bind=self._connection, checkfirst=True)
            source.rename(name=source.name + '_b', connection=self._connection)
            destination.rename(name=table, connection=self._connection)
            for index in source.indexes:
                index.rename(index.name[0:-2] + '_b', connection=self._connection)
            for index in destination.indexes:
                index.rename(original_names[index.name], connection=self._connection)

        for func in _after_replace_callbacks:
            func(destination, source)
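# A hedged usage sketch of replace() above; the connection accessor and table
# name are hypothetical, the swap semantics come from the code:
#
#   lore.io.main.replace('user_features', dataframe)
#   # loads dataframe into a hashed tmp_* table with copies of the source
#   # indexes, then renames tables and indexes in one transaction; the
#   # previous contents survive as 'user_features_b'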
def fit(self, data):
    with timer('fit %s' % self.name, logging.DEBUG):
        series = self.series(data)
        self.__min = series.min()
        self.__range = series.max() - self.__min
        if isinstance(self.__range, timedelta):
            logger.warning('Discrete timedelta requires (slower) 64-bit float math. '
                           'Could you use the epoch instead for %s?' % self.name)
            # convert the timedelta range to nanoseconds
            self.__range = self.__range.total_seconds() * 1000000000
def fit(self, data):
    with timer('fit %s' % self.name, logging.DEBUG):
        series = self.series(data)
        series_cut, self.bins = pandas.qcut(series, self.quantiles, retbins=True,
                                            labels=False, duplicates='drop')
        self.quantiles = len(self.bins) - 1
        self.missing_value = self.quantiles + 2
        self.lower_bound = series.min()
        self.upper_bound = series.max()
        self.dtype = self._type_from_cardinality()
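# An illustrative note on the fit() above, paired with the quantile
# transform() shown earlier: qcut with duplicates='drop' can return fewer bin
# edges than requested, so self.quantiles is recomputed from the actual edges.
#
#   series = [1, 2, 2, 2, 3], quantiles = 4
#   -> collapsed edges leave len(bins) - 1 real bins
#   -> at transform time: values below lower_bound -> code `quantiles`,
#      above upper_bound -> `quantiles + 1`, nulls -> `missing_value`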