Code Example #1
File: predict.py Project: xyzhang97/ludwig
def calculate_overall_stats(test_stats, output_features, dataset,
                            train_set_metadata):
    for output_feature in output_features:
        feature = get_from_registry(output_feature['type'],
                                    output_type_registry)
        feature.calculate_overall_stats(test_stats, output_feature, dataset,
                                        train_set_metadata)
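
Every snippet on this page is a call site of the same helper. get_from_registry itself is not shown here; the following is a minimal sketch, inferred from how these call sites use it (the actual Ludwig implementation may differ):

def get_from_registry(key, registry):
    # registries map string keys (feature types, tokenizer names, encoder
    # names, ...) to classes or functions; callers instantiate or invoke
    # whatever comes back
    if hasattr(key, 'lower'):
        key = key.lower()
    if key in registry:
        return registry[key]
    raise ValueError(
        'Key {} not supported, available options: {}'.format(
            key, list(registry)))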
Code Example #2
File: strings_utils.py Project: mrphu3074/ludwig
def build_sequence_matrix(sequences, inverse_vocabulary, format, length_limit,
                          padding_symbol, padding='right',
                          lowercase=True):
    tokenizer = get_from_registry(format, tokenizer_registry)()
    format_dtype = int_type(len(inverse_vocabulary) - 1)

    max_length = 0
    unit_vectors = []
    for sequence in sequences:
        unit_indices_vector = _get_sequence_vector(
            sequence,
            tokenizer,
            format_dtype,
            inverse_vocabulary,
            lowercase=lowercase
        )
        unit_vectors.append(unit_indices_vector)
        if len(unit_indices_vector) > max_length:
            max_length = len(unit_indices_vector)

    if max_length < length_limit:
        logging.debug('max length of {0}: {1} < limit: {2}'.format(
            format, max_length, length_limit
        ))
    # the matrix is always length_limit wide; longer sequences are truncated
    max_length = length_limit
    sequence_matrix = np.full((len(sequences), max_length),
                              inverse_vocabulary[padding_symbol],
                              dtype=format_dtype)
    for i, vector in enumerate(unit_vectors):
        limit = min(vector.shape[0], max_length)
        if padding == 'right':
            sequence_matrix[i, :limit] = vector[:limit]
        else:  # padding == 'left'
            sequence_matrix[i, max_length - limit:] = vector[:limit]
    return sequence_matrix
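
The right/left padding branch above is easiest to see with concrete numbers. A self-contained toy run of the same logic (hypothetical data, assuming NumPy):

import numpy as np

vectors = [np.array([3, 4]), np.array([5, 6, 7, 8, 9])]
max_length = 4  # plays the role of length_limit
matrix = np.full((len(vectors), max_length), 0, dtype=np.int32)  # 0 = pad id
for i, vector in enumerate(vectors):
    limit = min(vector.shape[0], max_length)
    matrix[i, :limit] = vector[:limit]  # padding='right'
# matrix -> [[3, 4, 0, 0],
#            [5, 6, 7, 8]]  (the 5-element vector is truncated to max_length;
# with padding='left' the first row would instead be [0, 0, 3, 4])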
Code Example #3
File: strings_utils.py Project: mrphu3074/ludwig
def create_vocabulary(data, format='space', custom_vocabulary=(),
                      add_unknown=True, add_padding=True,
                      lowercase=True,
                      num_most_frequent=None):
    max_line_length = 0
    unit_counts = Counter()

    if format == 'custom':
        vocab = sorted(list(set(custom_vocabulary)))
    else:
        tokenizer = get_from_registry(format, tokenizer_registry)()
        for line in data:
            processed_line = tokenizer(line.lower() if lowercase else line)
            unit_counts.update(processed_line)
            max_line_length = max(max_line_length, len(processed_line))

        vocab = [unit for unit, count in
                 unit_counts.most_common(num_most_frequent)]

    if add_unknown:
        vocab = [UNKNOWN_SYMBOL] + vocab
    if add_padding:
        vocab = [PADDING_SYMBOL] + vocab

    str2idx = {unit: i for i, unit in enumerate(vocab)}
    str2freq = {unit: unit_counts.get(unit, 0) for unit in vocab}

    return vocab, str2idx, str2freq, max_line_length
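
A hypothetical call makes the four return values concrete. This assumes the 'space' tokenizer splits on whitespace and that UNKNOWN_SYMBOL and PADDING_SYMBOL are '<UNK>' and '<PAD>' (their actual values are not shown on this page):

data = ['a b a', 'b c']
vocab, str2idx, str2freq, max_len = create_vocabulary(data, format='space')
# vocab    -> ['<PAD>', '<UNK>', 'a', 'b', 'c']  (units ordered by frequency,
#             with the padding and unknown symbols prepended)
# str2idx  -> {'<PAD>': 0, '<UNK>': 1, 'a': 2, 'b': 3, 'c': 4}
# str2freq -> {'<PAD>': 0, '<UNK>': 0, 'a': 2, 'b': 2, 'c': 1}
# max_len  -> 3  (longest tokenized line)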
Code Example #4
File: image_feature.py Project: zxyf1/ludwig
    def build_input(self,
                    regularizer,
                    dropout_rate,
                    is_training=False,
                    **kwargs):
        placeholder = self._get_input_placeholder()
        logger.debug('  placeholder: {0}'.format(placeholder))

        scaled = get_from_registry(self.scaling,
                                   image_scaling_registry)(placeholder)
        logger.debug('  scaled: {0}'.format(scaled))

        feature_representation, feature_representation_size = self.encoder_obj(
            placeholder,
            regularizer,
            dropout_rate,
            is_training,
        )
        logger.debug(
            '  feature_representation: {0}'.format(feature_representation))

        feature_representation = {
            'name': self.name,
            'type': self.type,
            'representation': feature_representation,
            'size': feature_representation_size,
            'placeholder': placeholder
        }
        return feature_representation
Code Example #5
def get_initializer(parameters):
    if parameters is None:
        # the registry's None key maps to the default initializer
        return initializers_registry[parameters]()
    elif isinstance(parameters, str):
        initializer_fun = get_from_registry(parameters, initializers_registry)
        return initializer_fun()
    elif isinstance(parameters, dict):
        initializer_fun = get_from_registry(parameters['type'],
                                            initializers_registry)
        arguments = parameters.copy()
        del arguments['type']
        return initializer_fun(**arguments)
    else:
        raise ValueError(
            'Initializer parameters should be either strings or dictionaries, '
            'but the provided parameters have type {}. '
            'Parameter values: {}'.format(type(parameters), parameters))
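
A toy registry shows the three dispatch branches; run together with the function above. The entries are hypothetical stand-ins, while Ludwig's real initializers_registry maps names to TensorFlow initializer constructors:

initializers_registry = {
    None: lambda: 'default-initializer',
    'zeros': lambda: 'zeros-initializer',
    'normal': lambda mean=0.0, stddev=1.0: ('normal', mean, stddev),
}

def get_from_registry(key, registry):  # minimal stand-in for this demo
    return registry[key]

print(get_initializer(None))     # -> 'default-initializer' (None-key entry)
print(get_initializer('zeros'))  # -> 'zeros-initializer' (looked up by name)
print(get_initializer({'type': 'normal', 'stddev': 0.05}))
# -> ('normal', 0.0, 0.05): 'type' selects the entry, the rest become kwargs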
Code Example #6
def get_sequence_vector(sequence, format, unit_to_id, lowercase=True):
    format_function = get_from_registry(format, format_registry)
    format_dtype = int_type(len(unit_to_id) - 1)
    return _get_sequence_vector(sequence,
                                format_function,
                                format_dtype,
                                unit_to_id,
                                lowercase=lowercase)
Code Example #7
def get_sequence_vector(sequence, tokenizer_type, unit_to_id, lowercase=True):
    tokenizer = get_from_registry(tokenizer_type, tokenizer_registry)()
    format_dtype = int_type(len(unit_to_id) - 1)
    return _get_sequence_vector(sequence,
                                tokenizer,
                                format_dtype,
                                unit_to_id,
                                lowercase=lowercase)
Code Example #8
def build_feature_parameters(features):
    feature_parameters = {}
    for feature in features:
        feature_builder_function = get_from_registry(
            feature['type'], parameters_builders_registry)

        feature_parameters[feature['name']] = feature_builder_function(feature)
    return feature_parameters
Code Example #9
    def _determine_samples(self):
        samples = []
        for _ in range(self.num_samples):
            sample = {}
            for hp_name, hp_params in self.parameters.items():
                sampling_function = get_from_registry(
                    hp_params['type'], sampling_functions_registry)
                sample[hp_name] = sampling_function(**hp_params)
            samples.append(sample)
        return samples
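
The sampling functions are resolved the same way. A hypothetical registry entry (the real sampling_functions_registry is not shown here); note that it must tolerate the extra 'type' key, because the whole hp_params dict is splatted into the call:

import random

def sample_float(low, high, **kwargs):
    # 'type' and any other extra keys arrive via **hp_params; ignore them
    return random.uniform(low, high)

sampling_functions_registry = {'float': sample_float}
# with parameters = {'learning_rate': {'type': 'float',
#                                      'low': 1e-5, 'high': 1e-2}},
# each sample maps 'learning_rate' to a uniform draw from [1e-5, 1e-2]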
Code Example #10
    def __init__(self,
                 reduce_output=None,
                 main_sequence_feature=None,
                 encoder=None,
                 **kwargs):
        self.combiner = SequenceConcatCombiner(
            reduce_output=reduce_output,
            main_sequence_feature=main_sequence_feature)

        self.encoder_obj = get_from_registry(
            encoder, sequence_encoder_registry)(should_embed=False, **kwargs)
Code Example #11
    def get_feature_meta(column, preprocessing_parameters):
        format_function = get_from_registry(preprocessing_parameters['format'],
                                            format_registry)
        max_length = 0
        for timeseries in column:
            processed_line = format_function(timeseries)
            max_length = max(max_length, len(processed_line))
        max_length = min(preprocessing_parameters['timeseries_length_limit'],
                         max_length)

        return {'max_timeseries_length': max_length}
Code Example #12
File: timeseries_feature.py Project: sriki18/ludwig
    def get_feature_meta(column, preprocessing_parameters):
        tokenizer = get_from_registry(preprocessing_parameters['tokenizer'],
                                      tokenizer_registry)()
        max_length = 0
        for timeseries in column:
            processed_line = tokenizer(timeseries)
            max_length = max(max_length, len(processed_line))
        max_length = min(preprocessing_parameters['timeseries_length_limit'],
                         max_length)

        return {'max_timeseries_length': max_length}
Code Example #13
def postprocess_results(result,
                        output_feature,
                        metadata,
                        experiment_dir_name='',
                        skip_save_unprocessed_output=False):
    feature = get_from_registry(output_feature['type'], output_type_registry)
    return feature.postprocess_results(
        output_feature,
        result,
        metadata,
        experiment_dir_name,
        skip_save_unprocessed_output=skip_save_unprocessed_output)
Code Example #14
def generate_datapoint(features):
    datapoint = []
    for feature in features:
        if ('cycle' in feature and feature['cycle'] == True
                and feature['type'] in cyclers_registry):
            cycler_function = cyclers_registry[feature['type']]
            feature_value = cycler_function(feature)
        else:
            generator_function = get_from_registry(feature['type'],
                                                   generators_registry)
            feature_value = generator_function(feature)
        datapoint.append(feature_value)
    return datapoint
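
Hypothetical registry entries make the cycler-versus-generator branch runnable when pasted after the function above (the real generators_registry and cyclers_registry are not shown on this page):

import random

generators_registry = {'numerical': lambda feature: random.random()}

def cycle_category(feature):
    # rotate deterministically through the feature's values across calls
    idx = feature.get('_idx', 0)
    feature['_idx'] = (idx + 1) % len(feature['values'])
    return feature['values'][idx]

cyclers_registry = {'category': cycle_category}

features = [
    {'name': 'x', 'type': 'numerical'},
    {'name': 'c', 'type': 'category', 'cycle': True, 'values': ['a', 'b']},
]
print(generate_datapoint(features))  # e.g. [0.417..., 'a']
print(generate_datapoint(features))  # e.g. [0.823..., 'b']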
Code Example #15
def reduce_sequence_list(sequence_list, mode):
    reduce_mode = get_from_registry(mode, reduce_mode_registry)
    reduced_list = []
    for sequence in sequence_list:
        reduced_list.append(reduce_mode(sequence))
    if len(reduced_list) > 1:
        if reduce_mode == dont_reduce:
            reduced_output = tf.concat(reduced_list, 2)
        else:
            reduced_output = tf.concat(reduced_list, 1)
    else:
        reduced_output = reduced_list[0]
    return reduced_output
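
The concat axis depends on whether the registry entry kept the sequence dimension: dont_reduce leaves rank-3 tensors, so they concatenate on axis 2, while reduced rank-2 tensors concatenate on axis 1. Hypothetical entries consistent with the code above (assuming TensorFlow; the real reduce_mode_registry is not shown here):

import tensorflow as tf

def reduce_sum(sequence):  # [batch, seq, hidden] -> [batch, hidden]
    return tf.reduce_sum(sequence, axis=1)

def dont_reduce(sequence):  # keeps the sequence axis (rank 3)
    return sequence

reduce_mode_registry = {'sum': reduce_sum, None: dont_reduce}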
Code Example #16
def create_vocabulary(data,
                      tokenizer_type='space',
                      add_unknown=True,
                      add_padding=True,
                      lowercase=True,
                      num_most_frequent=None,
                      vocab_file=None,
                      unknown_symbol=UNKNOWN_SYMBOL,
                      padding_symbol=PADDING_SYMBOL):
    vocab = None
    max_line_length = 0
    unit_counts = Counter()

    if tokenizer_type == 'bert':
        vocab = load_vocabulary(vocab_file)
        add_unknown = False
        add_padding = False
    elif vocab_file is not None:
        vocab = load_vocabulary(vocab_file)

    tokenizer = get_from_registry(tokenizer_type,
                                  tokenizer_registry)(vocab_file=vocab_file)
    for line in data:
        processed_line = tokenizer(line.lower() if lowercase else line)
        unit_counts.update(processed_line)
        max_line_length = max(max_line_length, len(processed_line))

    if vocab is None:
        vocab = [
            unit for unit, count in unit_counts.most_common(num_most_frequent)
        ]

    vocab_set = set(vocab)
    if add_unknown:
        if unknown_symbol in vocab_set:
            vocab.remove(unknown_symbol)
        vocab = [unknown_symbol] + vocab
    if add_padding:
        if padding_symbol in vocab_set:
            vocab.remove(padding_symbol)
        vocab = [padding_symbol] + vocab

    str2idx = {unit: i for i, unit in enumerate(vocab)}
    str2freq = {unit: unit_counts.get(unit, 0) for unit in vocab}

    return vocab, str2idx, str2freq, max_line_length
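
The remove-then-prepend step above guarantees fixed indices for the special symbols even when they already occur in the data. A toy trace (hypothetical symbol values):

vocab = ['<UNK>', 'the', 'a']  # '<UNK>' already present from the data
vocab_set = set(vocab)
if '<UNK>' in vocab_set:
    vocab.remove('<UNK>')      # drop the duplicate first
vocab = ['<UNK>'] + vocab      # -> ['<UNK>', 'the', 'a']
if '<PAD>' in vocab_set:
    vocab.remove('<PAD>')      # not present, nothing removed
vocab = ['<PAD>'] + vocab      # -> ['<PAD>', '<UNK>', 'the', 'a']
# padding always lands at index 0 and unknown at index 1 in str2idx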
Code Example #17
File: preprocessing.py Project: magiciiboy/ludwig
def build_metadata(dataset_df, features, global_preprocessing_parameters):
    train_set_metadata = {}
    for feature in features:
        get_feature_meta = get_from_registry(
            feature['type'], base_type_registry).get_feature_meta
        if 'preprocessing' in feature:
            preprocessing_parameters = merge_dict(
                global_preprocessing_parameters[feature['type']],
                feature['preprocessing'])
        else:
            preprocessing_parameters = global_preprocessing_parameters[
                feature['type']]
        train_set_metadata[feature['name']] = get_feature_meta(
            dataset_df[feature['name']].astype(str), preprocessing_parameters)
    return train_set_metadata
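
merge_dict itself is not shown on this page; a minimal shallow sketch consistent with its use here, where feature-level 'preprocessing' keys override the global defaults (the real helper may merge nested dicts recursively):

def merge_dict(base, override):
    merged = base.copy()
    merged.update(override)  # feature-level settings win over global ones
    return merged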
Code Example #18
File: preprocessing.py Project: magiciiboy/ludwig
def build_data(dataset_df, features, train_set_metadata,
               global_preprocessing_parameters):
    data = {}
    for feature in features:
        add_feature_data = get_from_registry(
            feature['type'], base_type_registry).add_feature_data
        if 'preprocessing' in feature:
            preprocessing_parameters = merge_dict(
                global_preprocessing_parameters[feature['type']],
                feature['preprocessing'])
        else:
            preprocessing_parameters = global_preprocessing_parameters[
                feature['type']]
        handle_missing_values(dataset_df, feature, preprocessing_parameters)
        if feature['name'] not in train_set_metadata:
            train_set_metadata[feature['name']] = {}
        train_set_metadata[
            feature['name']]['preprocessing'] = preprocessing_parameters
        add_feature_data(feature, dataset_df, data, train_set_metadata,
                         preprocessing_parameters)
    return data
Code Example #19
    def build_matrix(
            timeseries,
            format_str,
            length_limit,
            padding_value,
            padding='right'
    ):
        tokenizer = get_from_registry(
            format_str,
            tokenizer_registry
        )()
        max_length = 0
        ts_vectors = []
        for ts in timeseries:
            ts_vector = np.array(tokenizer(ts)).astype(np.float32)
            ts_vectors.append(ts_vector)
            if len(ts_vector) > max_length:
                max_length = len(ts_vector)

        if max_length < length_limit:
            logger.debug(
                'max length of {0}: {1} < limit: {2}'.format(
                    format_str,
                    max_length,
                    length_limit
                )
            )
        # the matrix is always length_limit wide; longer series are truncated
        max_length = length_limit
        timeseries_matrix = np.full(
            (len(timeseries), max_length),
            padding_value,
            dtype=np.float32
        )
        for i, vector in enumerate(ts_vectors):
            limit = min(vector.shape[0], max_length)
            if padding == 'right':
                timeseries_matrix[i, :limit] = vector[:limit]
            else:  # padding == 'left'
                timeseries_matrix[i, max_length - limit:] = vector[:limit]
        return timeseries_matrix
Code Example #20
def build_single_output(output_feature,
                        feature_hidden,
                        feature_hidden_size,
                        final_hidden,
                        dropout_rate,
                        regularizer,
                        is_training=True,
                        **kwargs):
    logger.debug('- Output {} feature {}'.format(output_feature['type'],
                                                 output_feature['name']))
    with tf.variable_scope(output_feature['name']):
        feature_class = get_from_registry(output_feature['type'],
                                          output_type_registry)
        feature = feature_class(output_feature)
        weighted_train_mean_loss, weighted_eval_loss, output_tensors = feature.concat_dependencies_and_build_output(
            feature_hidden,
            feature_hidden_size,
            final_hidden,
            dropout_rate=dropout_rate,
            regularizer=regularizer,
            is_training=is_training,
            **kwargs)
    return weighted_train_mean_loss, weighted_eval_loss, output_tensors
Code Example #21
def build_single_input(input_feature,
                       regularizer,
                       dropout_rate,
                       is_training=True,
                       **kwargs):
    scope_name = input_feature['name']
    logging.debug('- Input {} feature {}'.format(input_feature['type'],
                                                 input_feature['name']))
    if input_feature['tied_weights'] is not None:
        scope_name = input_feature['tied_weights']

    with tf.variable_scope(scope_name, reuse=tf.AUTO_REUSE):
        input_feature_class = get_from_registry(input_feature['type'],
                                                input_type_registry)
        input_feature_obj = input_feature_class(input_feature)
        feature_representation = input_feature_obj.build_input(
            regularizer=regularizer,
            dropout_rate=dropout_rate,
            is_training=is_training,
            **kwargs)
        feature_representation['representation'] = tf.identity(
            feature_representation['representation'], name=scope_name)
    return feature_representation
Code Example #22
def get_build_hyperopt_executor(executor_type):
    return get_from_registry(executor_type, executor_registry)
Code Example #23
def get_build_combiner(combiner_type):
    return get_from_registry(combiner_type, combiner_registry)
Code Example #24
    def get_date_encoder(self, encoder_parameters):
        return get_from_registry(self.encoder,
                                 date_encoder_registry)(**encoder_parameters)
Code Example #25
def reduce_sequence(sequence, mode):
    reduce_mode = get_from_registry(mode, reduce_mode_registry)
    return reduce_mode(sequence)
Code Example #26
File: vector_feature.py Project: sriki18/ludwig
    def get_vector_decoder(self, decoder_parameters):
        return get_from_registry(self.decoder,
                                 vector_decoder_registry)(**decoder_parameters)
Code Example #27
File: preprocessing.py Project: magiciiboy/ludwig
def get_dataset_fun(dataset_type):
    return get_from_registry(dataset_type, dataset_type_registry)
Code Example #28
    def get_sequence_encoder(self, encoder_parameters):
        return get_from_registry(
            self.encoder, sequence_encoder_registry)(**encoder_parameters)
Code Example #29
    def get_sequence_decoder(self, decoder_parameters):
        return get_from_registry(
            self.decoder, sequence_decoder_registry)(**decoder_parameters)
Code Example #30
File: vector_feature.py Project: zxyf1/ludwig
    def get_vector_encoder(self, encoder_parameters):
        return get_from_registry(self.encoder,
                                 vector_encoder_registry)(**encoder_parameters)