def calculate_overall_stats(test_stats, output_features, dataset,
                            train_set_metadata):
    for output_feature in output_features:
        feature = get_from_registry(output_feature['type'],
                                    output_type_registry)
        feature.calculate_overall_stats(test_stats, output_feature, dataset,
                                        train_set_metadata)
def build_sequence_matrix(sequences, inverse_vocabulary, format, length_limit,
                          padding_symbol, padding='right', lowercase=True):
    tokenizer = get_from_registry(format, tokenizer_registry)()
    format_dtype = int_type(len(inverse_vocabulary) - 1)

    max_length = 0
    unit_vectors = []
    for sequence in sequences:
        unit_indices_vector = _get_sequence_vector(
            sequence,
            tokenizer,
            format_dtype,
            inverse_vocabulary,
            lowercase=lowercase
        )
        unit_vectors.append(unit_indices_vector)
        if len(unit_indices_vector) > max_length:
            max_length = len(unit_indices_vector)

    if max_length < length_limit:
        logging.debug('max length of {0}: {1} < limit: {2}'.format(
            format, max_length, length_limit
        ))
    max_length = length_limit

    sequence_matrix = np.full((len(sequences), max_length),
                              inverse_vocabulary[padding_symbol],
                              dtype=format_dtype)
    for i, vector in enumerate(unit_vectors):
        limit = min(vector.shape[0], max_length)
        if padding == 'right':
            sequence_matrix[i, :limit] = vector[:limit]
        else:  # padding == 'left'
            sequence_matrix[i, max_length - limit:] = vector[:limit]
    return sequence_matrix
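# Usage sketch (hypothetical, not from the source): assuming the 'space'
# tokenizer splits on whitespace and _get_sequence_vector maps each token to
# its id through inverse_vocabulary, with the padding symbol at index 0:
#
#     inverse_vocabulary = {'<PAD>': 0, '<UNK>': 1, 'hello': 2, 'world': 3}
#     matrix = build_sequence_matrix(
#         ['hello world', 'hello'], inverse_vocabulary, 'space',
#         length_limit=4, padding_symbol='<PAD>')
#     # -> array([[2, 3, 0, 0],
#     #           [2, 0, 0, 0]])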
def create_vocabulary(data, format='space', custom_vocabulary=(),
                      add_unknown=True, add_padding=True, lowercase=True,
                      num_most_frequent=None):
    max_line_length = 0
    unit_counts = Counter()

    if format == 'custom':
        vocab = sorted(list(set(custom_vocabulary)))
    else:
        tokenizer = get_from_registry(format, tokenizer_registry)()
        for line in data:
            processed_line = tokenizer(line.lower() if lowercase else line)
            unit_counts.update(processed_line)
            max_line_length = max(max_line_length, len(processed_line))
        vocab = [unit for unit, count in
                 unit_counts.most_common(num_most_frequent)]

    if add_unknown:
        vocab = [UNKNOWN_SYMBOL] + vocab
    if add_padding:
        vocab = [PADDING_SYMBOL] + vocab

    str2idx = {unit: i for i, unit in enumerate(vocab)}
    str2freq = {unit: unit_counts.get(unit) if unit in unit_counts else 0
                for unit in vocab}

    return vocab, str2idx, str2freq, max_line_length
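# Usage sketch (hypothetical, assuming UNKNOWN_SYMBOL == '<UNK>' and
# PADDING_SYMBOL == '<PAD>'): the special symbols are prepended and the
# remaining units are ordered by descending frequency:
#
#     vocab, str2idx, str2freq, max_len = create_vocabulary(
#         ['hello world', 'hello'])
#     # vocab    -> ['<PAD>', '<UNK>', 'hello', 'world']
#     # str2idx  -> {'<PAD>': 0, '<UNK>': 1, 'hello': 2, 'world': 3}
#     # str2freq -> {'<PAD>': 0, '<UNK>': 0, 'hello': 2, 'world': 1}
#     # max_len  -> 2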
def build_input(self, regularizer, dropout_rate, is_training=False, **kwargs):
    placeholder = self._get_input_placeholder()
    logger.debug('  placeholder: {0}'.format(placeholder))

    scaled = get_from_registry(self.scaling, image_scaling_registry)(
        placeholder)
    logger.debug('  scaled: {0}'.format(scaled))

    # feed the scaled image (not the raw placeholder) to the encoder,
    # otherwise the scaling computed above is silently discarded
    feature_representation, feature_representation_size = self.encoder_obj(
        scaled,
        regularizer,
        dropout_rate,
        is_training,
    )
    logger.debug(
        '  feature_representation: {0}'.format(feature_representation))

    feature_representation = {
        'name': self.name,
        'type': self.type,
        'representation': feature_representation,
        'size': feature_representation_size,
        'placeholder': placeholder
    }
    return feature_representation
def get_initializer(parameters):
    if parameters is None:
        # the registry maps the None key to the default initializer
        return initializers_registry[parameters]()
    elif isinstance(parameters, str):
        initializer_fun = get_from_registry(parameters, initializers_registry)
        return initializer_fun()
    elif isinstance(parameters, dict):
        initializer_fun = get_from_registry(parameters['type'],
                                            initializers_registry)
        arguments = parameters.copy()
        del arguments['type']
        return initializer_fun(**arguments)
    else:
        raise ValueError(
            'Initializers parameters should be either strings or '
            'dictionaries, but the provided parameters are a {}. '
            'Parameters values: {}'.format(type(parameters), parameters))
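# Usage sketch (hypothetical registry entries): a string names a
# zero-argument initializer, while a dict selects the type and passes the
# remaining keys as keyword arguments to its constructor:
#
#     get_initializer(None)      # registry default, keyed by None
#     get_initializer('zeros')   # assuming a 'zeros' entry exists
#     get_initializer({'type': 'truncated_normal', 'stddev': 0.1})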
def get_sequence_vector(sequence, format, unit_to_id, lowercase=True):
    format_function = get_from_registry(format, format_registry)
    format_dtype = int_type(len(unit_to_id) - 1)
    return _get_sequence_vector(sequence, format_function, format_dtype,
                                unit_to_id, lowercase=lowercase)
def get_sequence_vector(sequence, tokenizer_type, unit_to_id, lowercase=True):
    tokenizer = get_from_registry(tokenizer_type, tokenizer_registry)()
    format_dtype = int_type(len(unit_to_id) - 1)
    return _get_sequence_vector(sequence, tokenizer, format_dtype, unit_to_id,
                                lowercase=lowercase)
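# Usage sketch (hypothetical): assuming the 'space' tokenizer splits on
# whitespace and _get_sequence_vector lowercases by default before the
# unit_to_id lookup:
#
#     unit_to_id = {'<PAD>': 0, '<UNK>': 1, 'hello': 2, 'world': 3}
#     get_sequence_vector('Hello world', 'space', unit_to_id)
#     # -> array([2, 3])  (integer dtype chosen by int_type)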
def build_feature_parameters(features):
    feature_parameters = {}
    for feature in features:
        feature_builder_function = get_from_registry(
            feature['type'], parameters_builders_registry)
        feature_parameters[feature['name']] = feature_builder_function(feature)
    return feature_parameters
def _determine_samples(self):
    samples = []
    for _ in range(self.num_samples):
        sample = {}
        for hp_name, hp_params in self.parameters.items():
            sampling_function = get_from_registry(
                hp_params['type'], sampling_functions_registry)
            sample[hp_name] = sampling_function(**hp_params)
        samples.append(sample)
    return samples
def __init__(self, reduce_output=None, main_sequence_feature=None,
             encoder=None, **kwargs):
    self.combiner = SequenceConcatCombiner(
        reduce_output=reduce_output,
        main_sequence_feature=main_sequence_feature)
    self.encoder_obj = get_from_registry(
        encoder, sequence_encoder_registry)(should_embed=False, **kwargs)
def get_feature_meta(column, preprocessing_parameters):
    format_function = get_from_registry(preprocessing_parameters['format'],
                                        format_registry)
    max_length = 0
    for timeseries in column:
        processed_line = format_function(timeseries)
        max_length = max(max_length, len(processed_line))
    max_length = min(preprocessing_parameters['timeseries_length_limit'],
                     max_length)
    return {'max_timeseries_length': max_length}
def get_feature_meta(column, preprocessing_parameters):
    tokenizer = get_from_registry(preprocessing_parameters['tokenizer'],
                                  tokenizer_registry)()
    max_length = 0
    for timeseries in column:
        processed_line = tokenizer(timeseries)
        max_length = max(max_length, len(processed_line))
    max_length = min(preprocessing_parameters['timeseries_length_limit'],
                     max_length)
    return {'max_timeseries_length': max_length}
def postprocess_results(result, output_feature, metadata,
                        experiment_dir_name='',
                        skip_save_unprocessed_output=False):
    feature = get_from_registry(output_feature['type'], output_type_registry)
    return feature.postprocess_results(
        output_feature,
        result,
        metadata,
        experiment_dir_name,
        skip_save_unprocessed_output=skip_save_unprocessed_output)
def generate_datapoint(features):
    datapoint = []
    for feature in features:
        if ('cycle' in feature and feature['cycle'] == True and
                feature['type'] in cyclers_registry):
            cycler_function = cyclers_registry[feature['type']]
            feature_value = cycler_function(feature)
        else:
            generator_function = get_from_registry(feature['type'],
                                                   generators_registry)
            feature_value = generator_function(feature)
        datapoint.append(feature_value)
    return datapoint
def reduce_sequence_list(sequence_list, mode):
    reduce_mode = get_from_registry(mode, reduce_mode_registry)
    reduced_list = []
    for sequence in sequence_list:
        reduced_list.append(reduce_mode(sequence))
    if len(reduced_list) > 1:
        if reduce_mode == dont_reduce:
            # un-reduced tensors keep their sequence dimension
            # [batch, sequence, hidden], so concatenate on the hidden axis
            reduced_output = tf.concat(reduced_list, 2)
        else:
            # reduced tensors are [batch, hidden]; concatenate on axis 1
            reduced_output = tf.concat(reduced_list, 1)
    else:
        reduced_output = reduced_list[0]
    return reduced_output
def create_vocabulary(data, tokenizer_type='space', add_unknown=True,
                      add_padding=True, lowercase=True, num_most_frequent=None,
                      vocab_file=None, unknown_symbol=UNKNOWN_SYMBOL,
                      padding_symbol=PADDING_SYMBOL):
    vocab = None
    max_line_length = 0
    unit_counts = Counter()

    if tokenizer_type == 'bert':
        vocab = load_vocabulary(vocab_file)
        add_unknown = False
        add_padding = False
    elif vocab_file is not None:
        vocab = load_vocabulary(vocab_file)

    tokenizer = get_from_registry(tokenizer_type, tokenizer_registry)(
        vocab_file=vocab_file)
    for line in data:
        processed_line = tokenizer(line.lower() if lowercase else line)
        unit_counts.update(processed_line)
        max_line_length = max(max_line_length, len(processed_line))

    if vocab is None:
        vocab = [unit for unit, count in
                 unit_counts.most_common(num_most_frequent)]

    vocab_set = set(vocab)
    if add_unknown:
        if unknown_symbol in vocab_set:
            vocab.remove(unknown_symbol)
        vocab = [unknown_symbol] + vocab
    if add_padding:
        if padding_symbol in vocab_set:
            vocab.remove(padding_symbol)
        vocab = [padding_symbol] + vocab

    str2idx = {unit: i for i, unit in enumerate(vocab)}
    str2freq = {
        unit: unit_counts.get(unit) if unit in unit_counts else 0
        for unit in vocab
    }

    return vocab, str2idx, str2freq, max_line_length
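# Usage sketch (hypothetical file path): for the 'bert' tokenizer the
# vocabulary is loaded verbatim from the vocabulary file and no
# unknown/padding symbols are added, keeping indices aligned with the
# pretrained checkpoint. Given an iterable of strings `lines`:
#
#     vocab, str2idx, str2freq, max_len = create_vocabulary(
#         lines, tokenizer_type='bert', vocab_file='vocab.txt')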
def build_metadata(dataset_df, features, global_preprocessing_parameters):
    train_set_metadata = {}
    for feature in features:
        get_feature_meta = get_from_registry(
            feature['type'], base_type_registry).get_feature_meta
        if 'preprocessing' in feature:
            preprocessing_parameters = merge_dict(
                global_preprocessing_parameters[feature['type']],
                feature['preprocessing'])
        else:
            preprocessing_parameters = global_preprocessing_parameters[
                feature['type']]
        train_set_metadata[feature['name']] = get_feature_meta(
            dataset_df[feature['name']].astype(str),
            preprocessing_parameters)
    return train_set_metadata
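# Usage sketch (hypothetical feature definitions): each feature's type
# selects its metadata builder from base_type_registry, and a per-feature
# 'preprocessing' dict overrides the global defaults for that type:
#
#     features = [
#         {'name': 'summary', 'type': 'text',
#          'preprocessing': {'lowercase': True}},
#         {'name': 'rating', 'type': 'numerical'},
#     ]
#     train_set_metadata = build_metadata(
#         dataset_df, features, global_preprocessing_parameters)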
def build_data(dataset_df, features, train_set_metadata,
               global_preprocessing_parameters):
    data = {}
    for feature in features:
        add_feature_data = get_from_registry(
            feature['type'], base_type_registry).add_feature_data
        if 'preprocessing' in feature:
            preprocessing_parameters = merge_dict(
                global_preprocessing_parameters[feature['type']],
                feature['preprocessing'])
        else:
            preprocessing_parameters = global_preprocessing_parameters[
                feature['type']]
        handle_missing_values(dataset_df, feature, preprocessing_parameters)
        if feature['name'] not in train_set_metadata:
            train_set_metadata[feature['name']] = {}
        train_set_metadata[
            feature['name']]['preprocessing'] = preprocessing_parameters
        add_feature_data(feature, dataset_df, data, train_set_metadata,
                         preprocessing_parameters)
    return data
def build_matrix(timeseries, format_str, length_limit, padding_value,
                 padding='right'):
    tokenizer = get_from_registry(format_str, tokenizer_registry)()

    max_length = 0
    ts_vectors = []
    for ts in timeseries:
        ts_vector = np.array(tokenizer(ts)).astype(np.float32)
        ts_vectors.append(ts_vector)
        if len(ts_vector) > max_length:
            max_length = len(ts_vector)

    if max_length < length_limit:
        logger.debug(
            'max length of {0}: {1} < limit: {2}'.format(
                format_str, max_length, length_limit))
    max_length = length_limit

    timeseries_matrix = np.full((len(timeseries), max_length),
                                padding_value,
                                dtype=np.float32)
    for i, vector in enumerate(ts_vectors):
        limit = min(vector.shape[0], max_length)
        if padding == 'right':
            timeseries_matrix[i, :limit] = vector[:limit]
        else:  # padding == 'left'
            timeseries_matrix[i, max_length - limit:] = vector[:limit]
    return timeseries_matrix
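# Usage sketch (hypothetical): assuming the 'space' tokenizer splits each
# timeseries string on whitespace before the float conversion:
#
#     build_matrix(['1 2 3', '4 5'], 'space',
#                  length_limit=4, padding_value=0.0)
#     # -> array([[1., 2., 3., 0.],
#     #           [4., 5., 0., 0.]], dtype=float32)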
def build_single_output(output_feature, feature_hidden, feature_hidden_size,
                        final_hidden, dropout_rate, regularizer,
                        is_training=True, **kwargs):
    logger.debug('- Output {} feature {}'.format(output_feature['type'],
                                                 output_feature['name']))
    with tf.variable_scope(output_feature['name']):
        feature_class = get_from_registry(output_feature['type'],
                                          output_type_registry)
        feature = feature_class(output_feature)
        weighted_train_mean_loss, weighted_eval_loss, output_tensors = \
            feature.concat_dependencies_and_build_output(
                feature_hidden,
                feature_hidden_size,
                final_hidden,
                dropout_rate=dropout_rate,
                regularizer=regularizer,
                is_training=is_training,
                **kwargs)
    return weighted_train_mean_loss, weighted_eval_loss, output_tensors
def build_single_input(input_feature, regularizer, dropout_rate,
                       is_training=True, **kwargs):
    scope_name = input_feature['name']
    logging.debug('- Input {} feature {}'.format(input_feature['type'],
                                                 input_feature['name']))
    if input_feature['tied_weights'] is not None:
        scope_name = input_feature['tied_weights']

    with tf.variable_scope(scope_name, reuse=tf.AUTO_REUSE):
        input_feature_class = get_from_registry(input_feature['type'],
                                                input_type_registry)
        input_feature_obj = input_feature_class(input_feature)
        feature_representation = input_feature_obj.build_input(
            regularizer=regularizer,
            dropout_rate=dropout_rate,
            is_training=is_training,
            **kwargs)
        feature_representation['representation'] = tf.identity(
            feature_representation['representation'],
            name=scope_name)
    return feature_representation
def get_build_hyperopt_executor(executor_type):
    return get_from_registry(executor_type, executor_registry)


def get_build_combiner(combiner_type):
    return get_from_registry(combiner_type, combiner_registry)


def get_date_encoder(self, encoder_parameters):
    return get_from_registry(self.encoder,
                             date_encoder_registry)(**encoder_parameters)


def reduce_sequence(sequence, mode):
    reduce_mode = get_from_registry(mode, reduce_mode_registry)
    return reduce_mode(sequence)


def get_vector_decoder(self, decoder_parameters):
    return get_from_registry(self.decoder,
                             vector_decoder_registry)(**decoder_parameters)


def get_dataset_fun(dataset_type):
    return get_from_registry(dataset_type, dataset_type_registry)


def get_sequence_encoder(self, encoder_parameters):
    return get_from_registry(
        self.encoder, sequence_encoder_registry)(**encoder_parameters)


def get_sequence_decoder(self, decoder_parameters):
    return get_from_registry(
        self.decoder, sequence_decoder_registry)(**decoder_parameters)


def get_vector_encoder(self, encoder_parameters):
    return get_from_registry(self.encoder,
                             vector_encoder_registry)(**encoder_parameters)