Ejemplo n.º 1
0
    def __init__(self, model_weight='mn_model_weight.h5', scale_ratio=1):
        """Prepare the character lookups, build the model and load its weights.

        Args:
            model_weight: Path handed to ``self.model.load_weights``.
            scale_ratio: Stored unchanged on the instance as ``scale_ratio``.
        """
        self.scale_ratio = scale_ratio

        # The vocabulary is every distinct character appearing in the strings
        # taken from ArtsInfo (ArtNames, TypeNames, MainAttrNames,
        # SubAttrNames) plus digits and the punctuation ".,+%".
        known_strings = (
            sum(ArtsInfo.ArtNames, []) + ArtsInfo.TypeNames +
            list(ArtsInfo.MainAttrNames.values()) +
            list(ArtsInfo.SubAttrNames.values()) + list(".,+%0123456789"))
        self.characters = sorted(set("".join(known_strings)))

        # Forward table: character -> integer id.
        self.char_to_num = StringLookup(mask_token="",
                                        num_oov_indices=0,
                                        vocabulary=list(self.characters))

        # Reverse table: integer id -> character, sharing the forward vocabulary.
        forward_vocabulary = self.char_to_num.get_vocabulary()
        self.num_to_char = StringLookup(invert=True,
                                        mask_token="",
                                        oov_token="",
                                        vocabulary=forward_vocabulary)

        # Fixed geometry passed to build_model, plus the maximum label length.
        self.width, self.height = 240, 16
        self.max_length = 15
        self.build_model(input_shape=(self.width, self.height))
        self.model.load_weights(model_weight)
Ejemplo n.º 2
0
def encode_string_categorical_feature(feature, name, dataset):
    """Index a string feature with ``StringLookup`` and encode it with ``CategoryEncoding``.

    Args:
        feature: Input that the fitted lookup and encoder layers are applied to.
        name: Key used to pull this feature out of each dataset element.
        dataset: Dataset of ``(x, y)`` pairs where ``x[name]`` holds the feature.

    Returns:
        The output of the binary-mode ``CategoryEncoding`` layer applied to
        the integer-indexed ``feature``.
    """
    # Narrow the dataset to the single column and add a trailing axis.
    column_ds = dataset.map(lambda x, y: x[name]).map(
        lambda x: tf.expand_dims(x, -1))

    # Fit the string -> integer table on the observed values, then index
    # the symbolic input with it.
    lookup = StringLookup()
    lookup.adapt(column_ds)
    feature_indices = lookup(feature)

    # Fit the encoder on the integer indices produced for the same column.
    one_hot = CategoryEncoding(output_mode="binary")
    one_hot.adapt(column_ds.map(lookup))

    return one_hot(feature_indices)
Ejemplo n.º 3
0
    def build(self, input_shape=None):
        """Create the vocabulary lookup and the embedding sub-layer.

        A rank-2 ``input_shape`` must have a last axis of size 1; that axis is
        dropped before the sub-layers are built and ``self.squeeze`` is set.

        Raises:
            ValueError: If ``input_shape`` has rank 2 and its last axis is not 1.
        """
        self.squeeze = False
        if 2 == len(input_shape):
            last_axis = input_shape[-1]
            if 1 != last_axis:
                raise ValueError(
                    'Input 0 of layer {} is incompatible with the layer: if ndim=2 expected axis[-1]=1, found '
                    'axis[-1]={}. Full shape received: {}'.format(self.name, last_axis, input_shape))

            input_shape = input_shape[:1]
            self.squeeze = True

        # Out-of-vocabulary words resolve to UNK_MARK; no mask token is reserved.
        self.lookup = StringLookup(vocabulary=self._vocabulary, mask_token=None, oov_token=self.UNK_MARK)
        self.lookup.build(input_shape)
        vocab_size = self.lookup.vocabulary_size()

        if 'adapt' == self.embed_type:
            self.embed = AdaptiveEmbedding(
                self.adapt_cutoff, vocab_size, self.output_dim, factor=self.adapt_factor,
                embeddings_initializer=self.embeddings_initializer)
        elif 'dense_auto' == self.embed_type:
            self.embed = layers.Embedding(
                vocab_size, self.output_dim, embeddings_initializer=self.embeddings_initializer)
            self.embed.build(input_shape)
        else:  # 'dense_cpu' == self.embed_type
            self.embed = layers.Embedding(
                vocab_size, self.output_dim, embeddings_initializer=self.embeddings_initializer)
            # Build the embedding under the CPU device scope.
            with tf.device('cpu:0'):
                self.embed.build(input_shape)

        super().build(input_shape)
Ejemplo n.º 4
0
 def _category_lookup(self, params: dict):
     """Apply a vocabulary lookup layer to the input layer described by ``params``.

     String inputs go through ``StringLookup``; every other dtype goes through
     ``IntegerLookup``. The vocabulary comes from ``params['vocabulary_file']``
     when that key is present, otherwise from ``params['vocabulary_list']``.

     Args:
         params: Feature configuration. Keys read here: ``num_oov_buckets``
             (optional, defaults to 0), and either ``vocabulary_file``
             together with ``vocabulary_size``, or ``vocabulary_list``.

     Returns:
         The lookup layer's output for the input layer, or ``None`` when
         ``params`` names neither a vocabulary file nor a vocabulary list.
     """
     _, input_layer = self._get_input_layer(params)
     num_oov_buckets = params.get('num_oov_buckets', 0)

     if input_layer.dtype == 'string':
         if 'vocabulary_file' in params:
             # NOTE(review): unlike the integer branch, max_tokens does not add
             # num_oov_buckets here -- confirm which sizing is intended.
             return StringLookup(max_tokens=params['vocabulary_size'],
                                 num_oov_indices=num_oov_buckets,
                                 mask_token=None,
                                 vocabulary=params['vocabulary_file'])(input_layer)
         if 'vocabulary_list' in params:
             return StringLookup(max_tokens=len(params['vocabulary_list']) + num_oov_buckets,
                                 num_oov_indices=num_oov_buckets,
                                 mask_token=None,
                                 vocabulary=params['vocabulary_list'])(input_layer)
     else:
         if 'vocabulary_file' in params:
             # Pass the configured file path itself, not a list holding the
             # literal key name.
             return IntegerLookup(max_values=params['vocabulary_size'] + num_oov_buckets,
                                  num_oov_indices=num_oov_buckets,
                                  mask_value=None,
                                  vocabulary=params['vocabulary_file'])(input_layer)
         if 'vocabulary_list' in params:
             return IntegerLookup(max_values=len(params['vocabulary_list']) + num_oov_buckets,
                                  num_oov_indices=num_oov_buckets,
                                  mask_value=None,
                                  vocabulary=params['vocabulary_list'])(input_layer)

     # No vocabulary source configured.
     return None
Ejemplo n.º 5
0
 def __init__(self, log_dir):
     """Record the log directory and build the character -> id lookup.

     Args:
         log_dir: Stored unchanged on the instance as ``log_dir``.
     """
     self.log_dir = log_dir
     self.START_TOKEN, self.END_TOKEN = '[SOS]', '[EOS]'

     # Sorted printable characters, followed by the two sequence markers.
     printable_chars = sorted(set(string.printable))
     self.vocab = printable_chars + [self.START_TOKEN, self.END_TOKEN]

     self.chars_to_ids = StringLookup(vocabulary=self.vocab)
     self.vocab_size = self.chars_to_ids.vocab_size()
def encode_inputs(inputs):
    """Embed categorical inputs, pass numeric inputs through, and concatenate them.

    Args:
        inputs: Mapping from feature name to the input for that feature.

    Returns:
        The ``layers.concatenate`` of every encoded feature, in the iteration
        order of ``inputs``.
    """
    encoded_features = []
    for feature_name in inputs:
        if feature_name not in CATEGORICAL_FEATURE_NAMES:
            # Numeric feature: used as-is, gaining a trailing axis only when
            # its last dimension is unknown.
            numeric_feature = inputs[feature_name]
            if numeric_feature.shape[-1] is None:
                numeric_feature = tf.expand_dims(numeric_feature, -1)
            encoded_features.append(numeric_feature)
            continue

        # Neither a mask token nor out-of-vocabulary values are expected,
        # hence mask_token=None and num_oov_indices=0.
        lookup = StringLookup(
            vocabulary=CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name],
            mask_token=None,
            num_oov_indices=0)
        value_index = lookup(inputs[feature_name])

        # Embedding width is the integer square root of the vocabulary size.
        vocabulary_size = lookup.vocabulary_size()
        embedding = layers.Embedding(
            input_dim=vocabulary_size,
            output_dim=int(math.sqrt(vocabulary_size)))
        encoded_features.append(embedding(value_index))

    return layers.concatenate(encoded_features)
Ejemplo n.º 7
0
    def load_data(self):
        """Build a character-level dataset from the text file at ``self.file_path``.

        The file is decoded as UTF-8, split into characters, mapped to integer
        ids and grouped into sequences of ``C.SEQUENCE_LENGTH + 1`` ids. The
        inverse (id -> character) lookup layer is stored on
        ``self.ids_to_chars_layer``.

        Returns:
            The dataset after batching, mapping through
            ``self._to_inputs_and_targets``, shuffling and prefetching.
        """
        # Use a context manager so the file handle is closed once the text
        # has been read.
        with GFile(self.file_path, 'rb') as text_file:
            data = text_file.read().decode(encoding='UTF-8')

        # Vocabulary: the sorted unique characters of the text
        vocab = sorted(set(data))

        chars_to_ids = StringLookup(vocabulary=vocab)
        self.ids_to_chars_layer = StringLookup(
            vocabulary=chars_to_ids.get_vocabulary(), invert=True)

        # Split the entire text by character
        chars = unicode_split(data, 'UTF-8')
        ids_of_chars = chars_to_ids(chars)

        # Group characters to form sequences (+1 since the targets are shifted by one)
        sequences_ds = Dataset.from_tensor_slices(ids_of_chars)
        sequences_ds = sequences_ds.batch(C.SEQUENCE_LENGTH + 1)

        # Batch the sequences
        ds = sequences_ds.padded_batch(C.BATCH_SIZE)
        ds = ds.map(self._to_inputs_and_targets,
                    num_parallel_calls=tf.data.experimental.AUTOTUNE)
        ds = ds.shuffle(C.BUFFER_SIZE)
        ds = ds.prefetch(tf.data.experimental.AUTOTUNE)

        return ds
Ejemplo n.º 8
0
def load_data():
    """Download the Titanic training CSV and return preprocessed train/test splits.

    Numeric columns are concatenated and passed through a ``Normalization``
    layer adapted on the data; every other column goes through a
    ``StringLookup`` followed by a ``CategoryEncoding``. The resulting
    preprocessing model is plotted, applied to the whole table, and the output
    is split 80/20.

    Returns:
        ``(train_features, train_labels, test_features, test_labels)``.
    """
    from sklearn.model_selection import train_test_split

    data = pd.read_csv(
        "https://storage.googleapis.com/tf-datasets/titanic/train.csv")
    labels = data.pop('survived')
    label_names = ["Not survived", "Survived"]  # not used further in this function

    # One symbolic input per column: object columns become string inputs,
    # everything else float32.
    features = {
        name: Input(shape=(1, ),
                    name=name,
                    dtype=string if column.dtype == object else float32)
        for name, column in data.items()
    }

    # Numeric inputs: concatenate, then normalize with statistics taken
    # from the raw numeric columns.
    numeric_inputs = {
        name: feature
        for name, feature in features.items() if feature.dtype == float32
    }
    merged_numeric = Concatenate()(list(numeric_inputs.values()))
    norm = Normalization()
    norm.adapt(np.array(data[numeric_inputs.keys()]))

    encoded_columns = [norm(merged_numeric)]

    # Non-numeric inputs: index each string against the column's unique
    # values, then encode the indices.
    for name, feature in features.items():
        if feature.dtype != float32:
            word = StringLookup(vocabulary=np.unique(data[name]))
            one_hot = CategoryEncoding(max_tokens=word.vocab_size())
            encoded_columns.append(one_hot(word(feature)))

    preprocessing_model = Model(features, Concatenate()(encoded_columns))

    utils.plot_model(model=preprocessing_model,
                     rankdir='LR',
                     dpi=72,
                     show_shapes=True)

    feature_dict = {name: np.array(value) for name, value in data.items()}
    preprocessed = preprocessing_model(feature_dict).numpy()

    train_features, test_features, train_labels, test_labels = train_test_split(
        preprocessed, labels, test_size=0.2)
    return train_features, train_labels, test_features, test_labels
Ejemplo n.º 9
0
    def build(self, input_shape):
        """Create the character-category lookup when a char-category option is set.

        Raises:
            ValueError: If the lookup's reported vocabulary size is not 30.
        """
        wants_first = self.options & WordShape.SHAPE_CHAR_CAT_FIRST
        wants_last = self.options & WordShape.SHAPE_CHAR_CAT_LAST
        if wants_first or wants_last:
            # Two-letter category codes; 'Cn' is supplied as the OOV token
            # rather than as a vocabulary entry.
            category_vocab = [
                'Lu', 'Ll', 'Lt', 'Lm', 'Lo',
                'Mn', 'Me', 'Mc',
                'Nd', 'Nl', 'No',
                'Zs', 'Zl', 'Zp',
                'Cc', 'Cf', 'Co', 'Cs',
                'Pd', 'Ps', 'Pe', 'Pc', 'Po',
                'Sm', 'Sc', 'Sk', 'So',
                'Pi', 'Pf',
            ]
            self.cat_lookup = StringLookup(vocabulary=category_vocab,
                                           oov_token='Cn',
                                           num_oov_indices=0)
            # Sanity check on the size reported by the lookup layer.
            if 30 != self.cat_lookup.vocab_size():
                raise ValueError('Wrong vocabulary size')

        super(WordShape, self).build(input_shape)
Ejemplo n.º 10
0
def encode_string_categorical_feature(feature, name, dataset):
    """Turn a string feature into integer indices, then into a binary encoding.

    Args:
        feature: Input that the fitted layers are applied to.
        name: Key of the feature inside each dataset element's ``x`` part.
        dataset: Dataset of ``(x, y)`` pairs used to fit both layers.

    Returns:
        The ``CategoryEncoding`` (binary mode) output for ``feature``.
    """
    def pick_feature(x, y):
        # Keep only the requested column of the feature part.
        return x[name]

    def add_trailing_axis(value):
        return tf.expand_dims(value, -1)

    string_index = StringLookup()
    values_ds = dataset.map(pick_feature)
    values_ds = values_ds.map(add_trailing_axis)

    # Learn the vocabulary, then index the symbolic input.
    string_index.adapt(values_ds)
    indexed_feature = string_index(feature)

    # Learn the index space from the indexed column, then encode.
    binary_encoder = CategoryEncoding(output_mode="binary")
    binary_encoder.adapt(values_ds.map(string_index))
    return binary_encoder(indexed_feature)
Ejemplo n.º 11
0
def encode_inputs(inputs, encoding_size):
    """Encode every input feature to a representation of width ``encoding_size``.

    Args:
        inputs: Mapping from feature name to the input for that feature.
        encoding_size: Output width of each embedding / dense projection.

    Returns:
        A list with one encoded feature per entry of ``inputs``, in
        iteration order.
    """
    encoded_features = []
    for feature_name in inputs:
        if feature_name not in CATEGORICAL_FEATURES_WITH_VOCABULARY:
            # Numeric feature: add a trailing axis and project it linearly.
            expanded = tf.expand_dims(inputs[feature_name], -1)
            projection = layers.Dense(units=encoding_size)
            encoded_features.append(projection(expanded))
            continue

        vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
        # No mask token and no out-of-vocabulary slot are reserved.
        index = StringLookup(num_oov_indices=0,
                             mask_token=None,
                             vocabulary=vocabulary)
        value_index = index(inputs[feature_name])

        # One embedding row per vocabulary entry.
        embedding_encoder = layers.Embedding(output_dim=encoding_size,
                                             input_dim=len(vocabulary))
        encoded_features.append(embedding_encoder(value_index))

    return encoded_features
Ejemplo n.º 12
0
def encode_inputs(inputs, use_embedding=False):
    """Encode categorical inputs (embedding or binary encoding) and concatenate all features.

    Args:
        inputs: Mapping from feature name to the input for that feature.
        use_embedding: When true, categorical features are embedded; otherwise
            they go through a binary-mode ``CategoryEncoding``.

    Returns:
        The ``layers.concatenate`` of all encoded features.
    """
    encoded_features = []
    for feature_name in inputs:
        if feature_name not in CATEGORICAL_FEATURE_NAMES:
            # Numeric feature: only a trailing axis is added.
            encoded_features.append(tf.expand_dims(inputs[feature_name], -1))
            continue

        vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
        # No mask token and no out-of-vocabulary slot are reserved.
        index = StringLookup(num_oov_indices=0,
                             mask_token=None,
                             vocabulary=vocabulary)
        value_index = index(inputs[feature_name])

        if not use_embedding:
            # Fit the encoder on the indices of the full vocabulary.
            onehot_encoder = CategoryEncoding(output_mode="binary")
            onehot_encoder.adapt(index(vocabulary))
            encoded_features.append(onehot_encoder(value_index))
            continue

        # Embedding width is the integer square root of the vocabulary size.
        vocabulary_size = len(vocabulary)
        embedding_encoder = layers.Embedding(
            input_dim=vocabulary_size,
            output_dim=int(math.sqrt(vocabulary_size)))
        encoded_features.append(embedding_encoder(value_index))

    return layers.concatenate(encoded_features)
Ejemplo n.º 13
0
class DataManager:
    """Loads SVG-text / PNG-image pairs named in ``<log_dir>/file_names.txt``."""

    def __init__(self, log_dir):
        """Store ``log_dir`` and build the character -> id lookup layer."""
        self.log_dir = log_dir
        self.START_TOKEN, self.END_TOKEN = '[SOS]', '[EOS]'
        # Sorted printable characters, then the two sequence markers.
        self.vocab = sorted(set(string.printable)) + [self.START_TOKEN, self.END_TOKEN]
        self.chars_to_ids = StringLookup(vocabulary=self.vocab)
        self.vocab_size = self.chars_to_ids.vocab_size()

    def load_dataset(self):
        """Return a padded-batch dataset built from the first five listed file names."""
        names_file = pathlib.Path(self.log_dir, 'file_names.txt')
        return (
            TextLineDataset(str(names_file))
            .take(5)
            .map(self.parse_svg_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            .padded_batch(2, drop_remainder=True)
        )

    def parse_svg_img(self, file_name):
        """Read one sample's SVG and PNG files.

        Returns:
            ``((svg_ids, image), svg_ids)`` where ``svg_ids`` are the looked-up
            ids of the SVG characters wrapped in the start/end tokens and
            ``image`` is the decoded 3-channel PNG as float32 divided by 255.
        """
        svg_path = tf.strings.join([self.log_dir, '/svgs/', file_name, '.svg'])
        img_path = tf.strings.join([self.log_dir, '/imgs/', file_name, '.png'])

        # SVG text -> characters, wrapped in start/end markers -> ids.
        svg_chars = unicode_split(tf.io.read_file(svg_path), 'UTF-8')
        svg_tokens = tf.concat([[self.START_TOKEN], svg_chars, [self.END_TOKEN]], axis=0)
        svg_ids = self.chars_to_ids(svg_tokens)

        # PNG bytes -> float32 pixels divided by 255.
        png_bytes = tf.io.read_file(img_path)
        pixels = tf.cast(tf.io.decode_png(png_bytes, channels=3), tf.float32) / 255.0

        return (svg_ids, pixels), svg_ids
Ejemplo n.º 14
0
    def __init__(self, vocabulary, embedding_dim, num_buckets, name=None):
        """Create the string lookup and the two bucketed embedding tables.

        Args:
            vocabulary: Vocabulary given to the ``StringLookup`` layer.
            embedding_dim: Output width of both embedding tables.
            num_buckets: Number of rows in each embedding table; also stored
                on the instance.
            name: Optional layer name forwarded to the base class.
        """
        super(QREmbedding, self).__init__(name=name)
        self.num_buckets = num_buckets

        # String -> index table without mask or out-of-vocabulary slots.
        self.index_lookup = StringLookup(
            num_oov_indices=0, mask_token=None, vocabulary=vocabulary
        )

        # Two independent tables of identical shape.
        table_shape = (num_buckets, embedding_dim)
        self.q_embeddings = layers.Embedding(*table_shape)
        self.r_embeddings = layers.Embedding(*table_shape)
Ejemplo n.º 15
0
    def _encode_categorical_feature(
        feature: KerasTensor,
        name: str,
        dataset: Optional[BatchDataset],
    ) -> KerasTensor:
        """Encode a categorical feature with a string lookup and a binary category encoding.

        Args:
            - feature: The input layer of the feature.
            - name: The feature's name, used as the key into each dataset element.
            - dataset: The training data; when falsy a placeholder tensor is returned.

        Returns:
            The encoded tensor of the input feature, or a float32 ``KerasTensor``
            of shape ``(None, 1)`` when no dataset is given.

        """
        if not dataset:
            # Placeholder used for tuner initialization, when no data exists yet.
            placeholder_spec = TensorSpec(
                shape=(None, 1), dtype=tf.float32, name=None)
            return KerasTensor(type_spec=placeholder_spec)

        # Dataset yielding only this feature, with a trailing axis added.
        column_ds = dataset.map(lambda x, y: x[name]).map(
            lambda x: tf.expand_dims(x, -1))

        # Fit the string -> integer table and index the input with it.
        lookup = StringLookup()
        lookup.adapt(column_ds)
        feature_indices = lookup(feature)

        # Fit the encoder on every index the lookup can emit.
        category_encoder = CategoryEncoding(output_mode="binary")
        category_encoder.adapt(np.arange(lookup.vocab_size()))

        return category_encoder(feature_indices)
Ejemplo n.º 16
0
def embedding_encoder(vocabulary, embedding_dim, num_oov_indices=0, name=None):
    """Build a ``keras.Sequential`` that maps strings to embeddings.

    Args:
        vocabulary: Vocabulary for the ``StringLookup`` layer.
        embedding_dim: Output width of the embedding layer.
        num_oov_indices: Number of out-of-vocabulary indices; also added to
            the embedding's input dimension.
        name: Optional prefix; the model is named ``"<name>_embedding"`` when
            it is truthy, otherwise it gets no explicit name.

    Returns:
        The sequential model: string lookup followed by embedding.
    """
    lookup = StringLookup(
        num_oov_indices=num_oov_indices, mask_token=None, vocabulary=vocabulary
    )
    # One row per vocabulary entry plus one per out-of-vocabulary index.
    embedding = layers.Embedding(
        output_dim=embedding_dim, input_dim=len(vocabulary) + num_oov_indices
    )
    model_name = f"{name}_embedding" if name else None
    return keras.Sequential([lookup, embedding], name=model_name)
Ejemplo n.º 17
0
    def get_svg_ds(self):
        """Build a dataset of character ids from ``datasets/svgs/simpleline.svg``.

        The file is decoded as UTF-8, split into characters and mapped to
        integer ids using a fixed five-character vocabulary. The inverse
        (id -> character) lookup layer is stored on ``self.ids_to_chars_layer``.

        Returns:
            The ids batched into sequences of ``C.SEQUENCE_LENGTH`` and then
            into batches of ``C.BATCH_SIZE`` sequences.
        """
        # Use a context manager so the file handle is closed once the text
        # has been read.
        with GFile('datasets/svgs/simpleline.svg', 'rb') as svg_file:
            data = svg_file.read().decode(encoding='UTF-8')

        # Fixed vocabulary (not derived from the file). Any other character
        # in the text falls back to the lookup layer's default
        # out-of-vocabulary handling.
        vocab = ['e', 'g', 'n', 'r', '\n']

        # Build the char -> id lookup and its inverse
        chars_to_ids = StringLookup(vocabulary=vocab)
        self.ids_to_chars_layer = StringLookup(
            vocabulary=chars_to_ids.get_vocabulary(), invert=True)

        # Split the entire text by character
        chars = unicode_split(data, 'UTF-8')
        ids_of_chars = chars_to_ids(chars)

        # Group characters to form sequences, then group sequences into batches
        svg_ds = Dataset.from_tensor_slices(ids_of_chars)
        svg_ds = svg_ds.batch(C.SEQUENCE_LENGTH)
        svg_ds = svg_ds.batch(C.BATCH_SIZE)

        return svg_ds
Ejemplo n.º 18
0
    def __init__(self, emb_name, vocab):
        """Create an embedding table and a string lookup for ``vocab``.

        Args:
            emb_name: Prefix for the embedding layer name
                (``"<emb_name>_embedding"``); also printed with the chosen
                output width.
            vocab: Vocabulary for the lookup; its length sets the embedding's
                input dimension.
        """
        super(CustomEmbed, self).__init__()

        self.vocab = vocab
        self.vocab_size = len(vocab)
        # Embedding width is the integer square root of the vocabulary size.
        self.output_dim = int(math.sqrt(self.vocab_size))

        self.custom_embed = layers.Embedding(name=f"{emb_name}_embedding",
                                             output_dim=self.output_dim,
                                             input_dim=self.vocab_size)
        # No mask token and no out-of-vocabulary slot are reserved.
        self.stringLookUp = StringLookup(num_oov_indices=0,
                                         mask_token=None,
                                         vocabulary=self.vocab)
        print(emb_name, self.output_dim)
Ejemplo n.º 19
0
def character_decoder(encoder):
    """Build the inverse lookup of a character encoder.

    Parameters:
        encoder: keras.preprocessing.StringLookup, character encoder whose
            vocabulary is reused.

    Returns:
        Character decoder(keras.preprocessing.StringLookup) created with
        ``invert=True``.
    """
    encoder_vocabulary = encoder.get_vocabulary()
    return StringLookup(vocabulary=encoder_vocabulary,
                        invert=True,
                        num_oov_indices=1,
                        mask_token=None)
Ejemplo n.º 20
0
def character_encoder(vocab):
    """Build a character -> integer lookup layer.

    Parameters:
        vocab: iterable of characters to be encoded; converted to a list.

    Returns:
        Character encoder(keras.preprocessing.StringLookup) with no mask
        token and no out-of-vocabulary indices.
    """
    characters = list(vocab)
    return StringLookup(vocabulary=characters,
                        invert=False,
                        num_oov_indices=0,
                        mask_token=None)
"""
### Building the character vocabulary

Keras provides different preprocessing layers to deal with different modalities of data.
[This guide](https://keras.io/guides/preprocessing_layers/) provides a comprehensive introduction.
Our example involves preprocessing labels at the character
level. This means that if there are two labels, e.g. "cat" and "dog", then our character
vocabulary should be {a, c, d, g, o, t} (without any special tokens). We use the
[`StringLookup`](https://keras.io/api/layers/preprocessing_layers/categorical/string_lookup/)
layer for this purpose.
"""

AUTOTUNE = tf.data.AUTOTUNE

# Forward lookup: character -> integer id, with no mask token reserved.
char_to_num = StringLookup(mask_token=None, vocabulary=list(characters))

# Inverse lookup: integer id -> character, built from the forward vocabulary.
num_to_char = StringLookup(invert=True,
                           mask_token=None,
                           vocabulary=char_to_num.get_vocabulary())
"""
### Resizing images without distortion

Instead of square images, many OCR models work with rectangular images. This will become
clearer in a moment when we will visualize a few samples from the dataset. While
aspect-unaware resizing of square images does not introduce a significant amount of
distortion, this is not the case for rectangular images. But resizing images to a uniform
size is a requirement for mini-batching. So we need to perform our resizing such that
the following criteria are met:
Ejemplo n.º 22
0
class WordEmbedding(layers.Layer):
    """Keras layer that normalizes string words, looks them up in a vocabulary and embeds them.

    The effective vocabulary always starts with ``UNK_MARK`` followed by any
    other reserved words, then the remaining entries of the supplied vocabulary.
    """

    # Out-of-vocabulary token; always the first entry of the effective vocabulary.
    UNK_MARK = '[UNK]'
    # Character inserted between the kept head and tail of a word longer than ``max_len``.
    REP_CHAR = '\uFFFD'

    def __init__(self, vocabulary, output_dim, normalize_unicode='NFKC', lower_case=False, zero_digits=False,
                 max_len=None, reserved_words=None, embed_type='dense_auto', adapt_cutoff=None, adapt_factor=4,
                 embeddings_initializer='uniform', **kwargs):
        """Validate and store the configuration and assemble the effective vocabulary.

        Args:
            vocabulary: List of unique strings.
            output_dim: Embedding width.
            normalize_unicode: Normalization form passed to ``miss_text.normalize_unicode``;
                a falsy value disables the step.
            lower_case: Whether ``adapt`` applies ``miss_text.lower_case``.
            zero_digits: Whether ``adapt`` applies ``miss_text.zero_digits``.
            max_len: ``None`` or an integer of at least 3; longer words are shortened in ``adapt``.
            reserved_words: Optional collection of unique words passed as ``skip`` to the
                normalization helpers.
            embed_type: One of ``'dense_auto'``, ``'dense_cpu'`` or ``'adapt'``.
            adapt_cutoff: Forwarded to ``AdaptiveEmbedding`` when ``embed_type`` is ``'adapt'``.
            adapt_factor: Forwarded to ``AdaptiveEmbedding`` as ``factor``.
            embeddings_initializer: Initializer identifier resolved with ``initializers.get``.
            **kwargs: Forwarded to the base layer constructor.

        Raises:
            ValueError: On a non-list or non-string vocabulary, duplicate vocabulary or
                reserved-word entries, ``max_len`` below 3, or an unknown ``embed_type``.
        """
        super().__init__(**kwargs)
        self.input_spec = layers.InputSpec(min_ndim=1, max_ndim=2, dtype='string')

        if not isinstance(vocabulary, list) or not all(map(lambda x: isinstance(x, str), vocabulary)):
            raise ValueError('Expected "vocabulary" to be a list of strings')
        if len(vocabulary) != len(set(vocabulary)):
            raise ValueError('Expected "vocabulary" to contain unique values')
        self.vocabulary = vocabulary

        self.output_dim = output_dim
        self.normalize_unicode = normalize_unicode
        self.lower_case = lower_case
        self.zero_digits = zero_digits

        if max_len is not None and max_len < 3:
            raise ValueError('Expected "max_len" to be None or greater then 2')
        self.max_len = max_len

        if reserved_words and len(reserved_words) != len(set(reserved_words)):
            raise ValueError('Expected "reserved_words" to contain unique values')
        self.reserved_words = reserved_words

        if embed_type not in {'dense_auto', 'dense_cpu', 'adapt'}:
            raise ValueError('Expected "embed_type" to be one of "dense_auto", "dense_cpu" or "adapt"')
        self.embed_type = embed_type

        self.adapt_cutoff = adapt_cutoff
        self.adapt_factor = adapt_factor
        self.embeddings_initializer = initializers.get(embeddings_initializer)

        # UNK_MARK always comes first; a user-supplied copy of it is dropped to avoid a duplicate.
        all_reserved_words = [] if reserved_words is None else [r for r in reserved_words if self.UNK_MARK != r]
        self._reserved_words = [self.UNK_MARK] + all_reserved_words

        # Reserved words absent from the vocabulary only trigger a warning, not an error.
        miss_reserved_words = [m for m in self._reserved_words if m not in vocabulary]
        if miss_reserved_words:
            tf.get_logger().warning('Vocabulary missed some reserved_words values: {}. '
                                    'This may indicate an error in vocabulary estimation'.format(miss_reserved_words))

        # Effective vocabulary: reserved words first, then the rest in their original order.
        clean_vocab = [w for w in vocabulary if w not in self._reserved_words]
        self._vocabulary = self._reserved_words + clean_vocab

    def vocab(self, word_counts, **kwargs):
        """Re-count words after running them through ``adapt``.

        Args:
            word_counts: Non-empty mapping from word (string) to count.
            **kwargs: Accepted but not used in this method.

        Returns:
            A ``Vocabulary`` in which each adapted form has accumulated the counts of the
            original words that produced it.

        Raises:
            ValueError: If ``word_counts`` is empty or has a non-string key.
        """
        if not word_counts:
            raise ValueError('Can\'t estimate vocabulary with empty word counter')
        if not all(map(lambda k: isinstance(k, str), word_counts.keys())):
            raise ValueError('Expected all words to be strings')

        word_counts = Vocabulary(word_counts)
        word_tokens = word_counts.tokens()
        adapt_words = self.adapt(word_tokens)
        # Give a rank-1 result a trailing axis so each row can be iterated as a collection below.
        if 1 == adapt_words.shape.rank:
            adapt_words = adapt_words[..., None]

        adapt_counts = Vocabulary()
        for adapts, word in zip(adapt_words, word_tokens):
            # Flatten the row and decode its byte strings as UTF-8.
            adapts = np.char.decode(adapts.numpy().reshape([-1]).astype('S'), 'utf-8')
            for adapt in adapts:
                adapt_counts[adapt] += word_counts[word]

        return adapt_counts

    @tf_utils.shape_type_conversion
    def build(self, input_shape=None):
        """Create the vocabulary lookup and the embedding sub-layer.

        A rank-2 ``input_shape`` must have a last axis of size 1; that axis is dropped
        and ``self.squeeze`` is set so that ``call`` squeezes its inputs.

        Raises:
            ValueError: If ``input_shape`` has rank 2 and its last axis is not 1.
        """
        self.squeeze = False
        if 2 == len(input_shape):
            if 1 != input_shape[-1]:
                raise ValueError(
                    'Input 0 of layer {} is incompatible with the layer: if ndim=2 expected axis[-1]=1, found '
                    'axis[-1]={}. Full shape received: {}'.format(self.name, input_shape[-1], input_shape))

            self.squeeze = True
            input_shape = input_shape[:1]

        # Out-of-vocabulary words resolve to UNK_MARK; no mask token is reserved.
        self.lookup = StringLookup(vocabulary=self._vocabulary, mask_token=None, oov_token=self.UNK_MARK)
        self.lookup.build(input_shape)

        if 'adapt' == self.embed_type:
            self.embed = AdaptiveEmbedding(
                self.adapt_cutoff, self.lookup.vocabulary_size(), self.output_dim, factor=self.adapt_factor,
                embeddings_initializer=self.embeddings_initializer)
        else:
            self.embed = layers.Embedding(
                self.lookup.vocabulary_size(), self.output_dim, embeddings_initializer=self.embeddings_initializer)
            if 'dense_auto' == self.embed_type:
                self.embed.build(input_shape)
            else:  # 'dense_cpu' == self.embed_type
                # Build the embedding under the CPU device scope.
                with tf.device('cpu:0'):
                    self.embed.build(input_shape)

        super().build(input_shape)

    def adapt(self, inputs):
        """Apply the configured text normalization steps to ``inputs``.

        Each enabled step calls the matching ``miss_text`` helper with
        ``skip=self._reserved_words`` (presumably leaving reserved words untouched --
        confirm against ``miss_text``). When ``max_len`` is set, words longer than
        ``max_len`` UTF-8 characters are replaced by a head substring, ``REP_CHAR`` and
        a tail substring joined together.

        Args:
            inputs: Values convertible to a string tensor.

        Returns:
            The string tensor after all enabled steps.
        """
        inputs = tf.convert_to_tensor(inputs, dtype='string')

        if self.normalize_unicode:
            inputs = miss_text.normalize_unicode(inputs, form=self.normalize_unicode, skip=self._reserved_words)
        if self.lower_case:
            inputs = miss_text.lower_case(inputs, skip=self._reserved_words)
        if self.zero_digits:
            inputs = miss_text.zero_digits(inputs, skip=self._reserved_words)

        if self.max_len is not None:
            # Candidate shortened form: head + REP_CHAR + tail, joined along the stacked axis.
            # NOTE(review): the exact slice bounds depend on miss_text.sub_string semantics -- confirm.
            inputs_ = tf.stack([
                miss_text.sub_string(inputs, 0, self.max_len // 2, skip=self._reserved_words),
                tf.fill(tf.shape(inputs), self.REP_CHAR),
                miss_text.sub_string(inputs, -self.max_len // 2 + 1, -1, skip=self._reserved_words)],
                axis=-1)
            inputs_ = tf.strings.reduce_join(inputs_, axis=-1)
            # Only words strictly longer than max_len take the shortened form.
            sizes = tf.strings.length(inputs, unit='UTF8_CHAR')
            inputs = tf.where(sizes > self.max_len, inputs_, inputs)

        return inputs

    def call(self, inputs, **kwargs):
        """Normalize the input words, map them to indices and return their embeddings."""
        if self.squeeze:
            # Workaround for Sequential model test
            inputs = tf.squeeze(inputs, axis=-1)

        adapts = self.adapt(inputs)
        indices = self.lookup(adapts)
        outputs = self.embed(indices)

        return outputs

    @tf_utils.shape_type_conversion
    def compute_output_shape(self, input_shape):
        """Return ``input_shape`` with ``output_dim`` appended as the last axis."""
        # NOTE(review): when ``self.squeeze`` is set, ``call`` drops the trailing axis of size 1,
        # which this shape does not reflect -- confirm whether that is intended.
        return input_shape + (self.output_dim,)

    def get_config(self):
        """Return the base layer config extended with this layer's constructor arguments."""
        config = super().get_config()
        config.update({
            'vocabulary': self.vocabulary,
            'output_dim': self.output_dim,
            'normalize_unicode': self.normalize_unicode,
            'lower_case': self.lower_case,
            'zero_digits': self.zero_digits,
            'max_len': self.max_len,
            'reserved_words': self.reserved_words,
            'embed_type': self.embed_type,
            'adapt_cutoff': self.adapt_cutoff,
            'adapt_factor': self.adapt_factor,
            'embeddings_initializer': initializers.serialize(self.embeddings_initializer)
        })

        return config
Ejemplo n.º 23
0
if __name__ == "__main__":
    # Script entry point: parse the three positional arguments, load and
    # preprocess the datasets, and instantiate the model.
    if len(sys.argv) < 4:
        print("usage: python predict.py [input_path] [model_path] [output_path]")
        sys.exit()

    input_path = sys.argv[1]
    model_path = sys.argv[2]
    # Third positional argument, as listed in the usage string; reading
    # argv[2] here would make output_path an alias of model_path.
    output_path = sys.argv[3]
    data_loader = DataLoader(input_path, training_ratio=0.7)

    raw_train_ds, raw_val_ds = data_loader.load()

    # Why N? for one encoding purpose, last character = [0, 0, 0, 0]
    VOCAB = ["A", "G", "T", "N"]
    string_lookup = StringLookup(vocabulary=VOCAB)

    AUTOTUNE = tf.data.experimental.AUTOTUNE
    BATCH_SIZE = 256
    SHUFFLE_SIZE = 1000

    # Training data is cached, shuffled and mapped through ``preprocess``;
    # validation data is cached and mapped without shuffling.
    encoded_train_ds = raw_train_ds.cache().shuffle(SHUFFLE_SIZE)
    encoded_train_ds = encoded_train_ds.prefetch(buffer_size=AUTOTUNE)
    encoded_train_ds = encoded_train_ds.map(preprocess)
    encoded_val_ds = raw_val_ds.cache().map(preprocess)

    train_ds = encoded_train_ds.cache().batch(BATCH_SIZE)
    train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)
    val_ds = encoded_val_ds.cache().batch(BATCH_SIZE)

    model = TwoTowerModel(RNA_length=33, gRNA_length=23)
Ejemplo n.º 24
0
# Name of the column used as the label when the CSV dataset is built.
TARGET_FEATURE_NAME = "income_bracket"
# A list of the labels of the target features.
# NOTE(review): each label starts with a space -- presumably matching the raw
# CSV values; confirm against the data file.
TARGET_LABELS = [" <=50K", " >50K"]
"""
## Create `tf.data.Dataset` objects for training and validation

We create an input function to read and parse the file, and convert features and labels
into a [`tf.data.Dataset`](https://www.tensorflow.org/guide/datasets)
for training and validation. We also preprocess the input by mapping the target label
to an index.
"""

from tensorflow.keras.layers.experimental.preprocessing import StringLookup

# Lookup from target label string to integer index over TARGET_LABELS,
# with no mask token and no out-of-vocabulary slot.
target_label_lookup = StringLookup(
    num_oov_indices=0, mask_token=None, vocabulary=TARGET_LABELS)


def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    """Read a header-less CSV file into a batched dataset with indexed targets.

    Args:
        csv_file_path: Path of the CSV file to read.
        shuffle: Whether ``make_csv_dataset`` shuffles the records.
        batch_size: Number of records per batch.

    Returns:
        A dataset of ``(features, target_index)`` batches, where the target
        strings have been mapped through ``target_label_lookup``.
    """
    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        column_defaults=COLUMN_DEFAULTS,
        label_name=TARGET_FEATURE_NAME,
        num_epochs=1,
        header=False,
        na_value="?",
        shuffle=shuffle,
    ).map(lambda features, target: (features, target_label_lookup(target)))
    # Hand the dataset back to the caller; without this the function
    # returned None.
    return dataset
Ejemplo n.º 25
0
class WordShape(tf.keras.layers.Layer):
    """Keras layer that turns string tokens into a vector of "shape" features.

    ``options`` is a bit mask built from the ``SHAPE_*`` flags below. Every
    selected flag adds values along a new last axis of the output. The order
    of the features is fixed by the order of the checks in ``call``:

    1. ``SHAPE_HAS_CASE``    - lower-cased and upper-cased forms differ.
    2. ``SHAPE_LOWER_CASE``  - has case and equals its lower-cased form.
    3. ``SHAPE_UPPER_CASE``  - has case and equals its upper-cased form.
    4. ``SHAPE_TITLE_CASE``  - has case and equals its title-cased form.
    5. ``SHAPE_MIXED_CASE``  - has case but is none of lower/upper/title.
    6. ``SHAPE_LENGTH_NORM`` - (length in UTF-8 chars - mean_len) / std_len.
    7. ``SHAPE_LEFT_SAME`` / ``SHAPE_RIGHT_SAME`` - token equals its
       neighbour one step to the left / right along the last input axis.
    8. ``SHAPE_LEFT2_SAME`` / ``SHAPE_RIGHT2_SAME`` - same, two steps away.
    9. ``SHAPE_CHAR_CAT_FIRST`` / ``SHAPE_CHAR_CAT_LAST`` - 30-wide one-hot
       encoding of the value returned by ``char_category`` for the token.

    Case conversion and category detection are delegated to the external
    helpers ``lower_case``, ``upper_case``, ``title_case`` and
    ``char_category``; their exact semantics are not visible in this class.
    """

    # Bit flags for the case-related features.
    SHAPE_HAS_CASE = 1
    SHAPE_LOWER_CASE = 2
    SHAPE_UPPER_CASE = 4
    SHAPE_TITLE_CASE = 8
    SHAPE_MIXED_CASE = 16
    SHAPE_ALL_CASES = SHAPE_HAS_CASE | SHAPE_LOWER_CASE | SHAPE_UPPER_CASE | SHAPE_TITLE_CASE | SHAPE_MIXED_CASE

    # Mean and std of token length, measured on Universal Dependencies and
    # large Russian POS corpora.
    # Tokens (split_words): 3.057 and 3.118
    # Words: 4.756 and 3.453
    SHAPE_LENGTH_NORM = 32

    # Bit flags for the "equal to a neighbouring token" features.
    SHAPE_LEFT_SAME = 64
    SHAPE_RIGHT_SAME = 128
    SHAPE_LEFT2_SAME = 256
    SHAPE_RIGHT2_SAME = 512
    SHAPE_ALL_SAME = SHAPE_LEFT_SAME | SHAPE_RIGHT_SAME | SHAPE_LEFT2_SAME | SHAPE_RIGHT2_SAME

    # Bit flags for the character-category one-hot features.
    SHAPE_CHAR_CAT_FIRST = 1024
    SHAPE_CHAR_CAT_LAST = 2048
    SHAPE_CHAR_CAT_BOTH = SHAPE_CHAR_CAT_FIRST | SHAPE_CHAR_CAT_LAST

    # Every feature at once.
    SHAPE_ALL = SHAPE_ALL_CASES | SHAPE_LENGTH_NORM | SHAPE_ALL_SAME | SHAPE_CHAR_CAT_BOTH

    def __init__(self,
                 options,
                 mean_len=3.906,
                 std_len=3.285,
                 char_embed=5,
                 *args,
                 **kwargs):
        """Create the layer.

        Args:
            options: Bit mask of ``SHAPE_*`` flags selecting the features.
            mean_len: Mean subtracted from the token length for
                ``SHAPE_LENGTH_NORM``.
            std_len: Divisor applied after subtracting ``mean_len``.
            char_embed: NOTE(review): accepted but never stored or used in
                this class, and not part of ``get_config``.
            *args: Forwarded to ``tf.keras.layers.Layer``.
            **kwargs: Forwarded to ``tf.keras.layers.Layer``.

        Raises:
            ValueError: If ``options`` is 0 (no feature selected).
        """
        super(WordShape, self).__init__(*args, **kwargs)
        self.input_spec = tf.keras.layers.InputSpec(dtype='string')
        self._supports_ragged_inputs = True

        if 0 == options:
            raise ValueError('At least one shape option should be selected')

        self.options = options
        self.mean_len = mean_len
        self.std_len = std_len

    @tf_utils.shape_type_conversion
    def build(self, input_shape):
        """Create the category lookup when a char-category option is set.

        Raises:
            ValueError: If the lookup does not report a vocabulary size of 30.
        """
        if self.options & WordShape.SHAPE_CHAR_CAT_FIRST or self.options & WordShape.SHAPE_CHAR_CAT_LAST:
            # 29 two-letter category codes; 'Cn' is configured as the OOV token.
            category_vocab = [
                'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Me', 'Mc', 'Nd', 'Nl',
                'No', 'Zs', 'Zl', 'Zp', 'Cc', 'Cf', 'Co', 'Cs', 'Pd', 'Ps',
                'Pe', 'Pc', 'Po', 'Sm', 'Sc', 'Sk', 'So', 'Pi', 'Pf'
            ]
            self.cat_lookup = StringLookup(num_oov_indices=0,
                                           oov_token='Cn',
                                           vocabulary=category_vocab)
            # The one-hot depth used in call() is hard-coded to 30, so the
            # lookup must report exactly that size.
            # NOTE(review): the list has 29 entries; the 30th slot presumably
            # is a token reserved by StringLookup - confirm against the
            # installed TensorFlow version.
            if self.cat_lookup.vocab_size() != 30:
                raise ValueError('Wrong vocabulary size')

        super(WordShape, self).build(input_shape)

    def call(self, inputs, **kwargs):
        """Compute the selected features for ``inputs``.

        Single-value features are collected in ``outputs_one`` and stacked on
        a new last axis; the one-hot category features are collected in
        ``outputs_many`` and concatenated after them. Everything is cast to
        the layer's compute dtype.
        """
        outputs_one, outputs_many = [], []

        # Case
        any_case = self.SHAPE_HAS_CASE | self.SHAPE_LOWER_CASE | self.SHAPE_UPPER_CASE | self.SHAPE_TITLE_CASE | \
                   self.SHAPE_MIXED_CASE
        if self.options & any_case:
            # Shared by all case features: a token "has case" when its
            # lower-cased and upper-cased forms differ.
            inputs_lower = lower_case(inputs)
            inputs_upper = upper_case(inputs)
            has_case = tf.not_equal(inputs_lower, inputs_upper)

        if self.options & self.SHAPE_HAS_CASE:
            outputs_one.append(has_case)

        # is_lower / is_upper / is_title are also needed by the mixed-case
        # feature, so they are computed when either flag is set but emitted
        # only for their own flag.
        if self.options & self.SHAPE_LOWER_CASE or self.options & self.SHAPE_MIXED_CASE:
            is_lower = tf.logical_and(has_case, tf.equal(inputs, inputs_lower))
        if self.options & self.SHAPE_LOWER_CASE:
            outputs_one.append(is_lower)

        if self.options & self.SHAPE_UPPER_CASE or self.options & self.SHAPE_MIXED_CASE:
            is_upper = tf.logical_and(has_case, tf.equal(inputs, inputs_upper))
        if self.options & self.SHAPE_UPPER_CASE:
            outputs_one.append(is_upper)

        if self.options & self.SHAPE_TITLE_CASE or self.options & self.SHAPE_MIXED_CASE:
            inputs_title = title_case(inputs)
            is_title = tf.logical_and(has_case, tf.equal(inputs, inputs_title))
        if self.options & self.SHAPE_TITLE_CASE:
            outputs_one.append(is_title)

        if self.options & self.SHAPE_MIXED_CASE:
            # Mixed case = has case, but is not lower, upper or title case.
            no_case = tf.logical_not(has_case)
            is_mixed = tf.logical_not(
                tf.logical_or(tf.logical_or(no_case, is_lower),
                              tf.logical_or(is_upper, is_title)))
            outputs_one.append(is_mixed)

        # Length
        if self.options & self.SHAPE_LENGTH_NORM:
            # Length in UTF-8 characters, standardised with mean_len/std_len.
            length_norm = tf.strings.length(inputs, unit='UTF8_CHAR')
            length_norm = (tf.cast(length_norm, self.compute_dtype) -
                           self.mean_len) / self.std_len
            outputs_one.append(length_norm)

        # Same
        any_same = self.SHAPE_LEFT_SAME | self.SHAPE_RIGHT_SAME | self.SHAPE_LEFT2_SAME | self.SHAPE_RIGHT2_SAME
        if self.options & any_same:
            # Pad the last axis with two filler elements on each side so the
            # shifted comparisons below are defined at the borders.
            # NOTE(review): zeros_like on a string tensor presumably yields
            # empty strings - confirm.
            empty_pad = tf.zeros_like(inputs[..., :1])
            inputs_padded = tf.concat(
                [empty_pad, empty_pad, inputs, empty_pad, empty_pad], axis=-1)

        if self.options & (self.SHAPE_LEFT_SAME | self.SHAPE_RIGHT_SAME):
            # same_one[k] compares padded[k + 1] with padded[k].
            same_one = tf.equal(inputs_padded[..., 1:],
                                inputs_padded[..., :-1])

        if self.options & self.SHAPE_LEFT_SAME:
            # Aligns every token with the comparison against its left neighbour.
            same_left = same_one[..., 1:-2]
            outputs_one.append(same_left)

        if self.options & self.SHAPE_RIGHT_SAME:
            # Aligns every token with the comparison against its right neighbour.
            same_right = same_one[..., 2:-1]
            outputs_one.append(same_right)

        if self.options & (self.SHAPE_LEFT2_SAME | self.SHAPE_RIGHT2_SAME):
            # same_two[k] compares padded[k + 2] with padded[k].
            same_two = tf.equal(inputs_padded[..., 2:],
                                inputs_padded[..., :-2])

        if self.options & self.SHAPE_LEFT2_SAME:
            # Token versus the element two positions to its left.
            same_left2 = same_two[..., :-2]
            outputs_one.append(same_left2)

        if self.options & self.SHAPE_RIGHT2_SAME:
            # Token versus the element two positions to its right.
            same_right2 = same_two[..., 2:]
            outputs_one.append(same_right2)

        # Char category
        if self.options & WordShape.SHAPE_CHAR_CAT_FIRST:
            # char_category with its default arguments; presumably the
            # category of the first character - confirm against the helper.
            first_cats = char_category(inputs)
            first_ids = self.cat_lookup(first_cats)
            first_feats = tf.one_hot(first_ids, depth=30)
            outputs_many.append(first_feats)

        if self.options & WordShape.SHAPE_CHAR_CAT_LAST:
            # Same lookup, with first=False passed to the helper.
            last_cats = char_category(inputs, first=False)
            last_ids = self.cat_lookup(last_cats)
            last_feats = tf.one_hot(last_ids, depth=30)
            outputs_many.append(last_feats)

        outputs_one = [tf.cast(o, self.compute_dtype) for o in outputs_one]
        outputs_many = [tf.cast(o, self.compute_dtype) for o in outputs_many]

        # Only category features selected: they already carry the last axis.
        if not outputs_one:
            return tf.concat(outputs_many, axis=-1)

        # Single-value features become one slot each on a new last axis.
        outputs_one = tf.stack(outputs_one, axis=-1)
        if not outputs_many:
            return outputs_one

        return tf.concat([outputs_one, *outputs_many], axis=-1)

    @tf_utils.shape_type_conversion
    def compute_output_shape(self, input_shape):
        """Return ``input_shape`` extended by the number of feature units.

        Each single-value option contributes 1 unit and each char-category
        option contributes 30.
        """
        units = 0
        options = [
            self.SHAPE_HAS_CASE, self.SHAPE_LOWER_CASE, self.SHAPE_UPPER_CASE,
            self.SHAPE_TITLE_CASE, self.SHAPE_MIXED_CASE,
            self.SHAPE_LENGTH_NORM, self.SHAPE_LEFT_SAME,
            self.SHAPE_RIGHT_SAME, self.SHAPE_LEFT2_SAME,
            self.SHAPE_RIGHT2_SAME
        ]
        for opt in options:
            if self.options & opt:
                units += 1

        if self.options & WordShape.SHAPE_CHAR_CAT_FIRST:
            units += 30
        if self.options & WordShape.SHAPE_CHAR_CAT_LAST:
            units += 30

        return input_shape + (units, )

    def get_config(self):
        """Return the base layer config plus ``options``, ``mean_len`` and ``std_len``."""
        config = super().get_config()
        config.update({
            'options': self.options,
            'mean_len': self.mean_len,
            'std_len': self.std_len
        })

        return config
Ejemplo n.º 26
0
def encode_input_features(inputs,
                          sequence_length,
                          USER_FEATURES,
                          CATEGORICAL_FEATURES_WITH_VOCABULARY,
                          movies,
                          genres,
                          include_user_id=True,
                          include_user_features=True,
                          include_movie_features=True):
    """Encode the model inputs into transformer features and other features.

    Args:
        inputs: Mapping from feature name to input tensor. The keys read here
            are ``"target_movie_id"``, ``"sequence_movie_ids"``,
            ``"sequence_ratings"`` and every name selected for the "other"
            features (``"user_id"`` and the entries of ``USER_FEATURES``).
        sequence_length: Size of the positional embedding table. Positions
            ``0 .. sequence_length - 2`` are embedded.
        USER_FEATURES: Names of the categorical user features.
        CATEGORICAL_FEATURES_WITH_VOCABULARY: Mapping from feature name to its
            vocabulary; must contain ``"movie_id"`` and every selected
            "other" feature.
        movies: Table indexed with ``movies[genres]`` and converted with
            ``.to_numpy()`` (presumably a pandas DataFrame - confirm).
        genres: Column selector for the genre indicator columns of ``movies``.
        include_user_id: Whether ``"user_id"`` is embedded as an other feature.
        include_user_features: Whether the ``USER_FEATURES`` are embedded as
            other features.
        include_movie_features: Whether the genre vector is merged into each
            movie embedding.

    Returns:
        A tuple ``(encoded_transformer_features, encoded_other_features)``.
        The second element is ``None`` when no other feature was selected.
    """
    encoded_transformer_features = []
    encoded_other_features = []

    other_feature_names = []
    if include_user_id:
        other_feature_names.append("user_id")
    # Gate the user features on their own flag. The previous code tested
    # `include_movie_features` here, which left `include_user_features`
    # without any effect.
    if include_user_features:
        other_feature_names.extend(USER_FEATURES)

    # Encode user features.
    for feature_name in other_feature_names:
        # Convert the string input values into integer indices.
        vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
        idx = StringLookup(vocabulary=vocabulary,
                           mask_token=None,
                           num_oov_indices=0)(inputs[feature_name])

        # Compute embedding dimensions.
        embedding_dims = int(math.sqrt(len(vocabulary)))

        # Create an embedding layer with the specified dimensions.
        embedding_encoder = layers.Embedding(
            input_dim=len(vocabulary),
            output_dim=embedding_dims,
            name=f"{feature_name}_embedding",
        )

        # Convert the index values to embedding representations.
        encoded_other_features.append(embedding_encoder(idx))

    # Create a single embedding vector for the user features.
    if len(encoded_other_features) > 1:
        encoded_other_features = layers.concatenate(encoded_other_features)
    elif len(encoded_other_features) == 1:
        encoded_other_features = encoded_other_features[0]
    else:
        encoded_other_features = None

    # Create a movie embedding encoder.
    movie_vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY["movie_id"]
    movie_embedding_dims = int(math.sqrt(len(movie_vocabulary)))

    # Create a lookup to convert string values to integer indices.
    # The lookup needs the vocabulary itself; `input_dim` is an Embedding
    # argument and is not accepted by StringLookup.
    movie_index_lookup = StringLookup(
        vocabulary=movie_vocabulary,
        mask_token=None,
        num_oov_indices=0,
        name="movie_index_lookup",
    )

    # Create an embedding layer with the specified dimensions.
    movie_embedding_encoder = layers.Embedding(
        input_dim=len(movie_vocabulary),
        output_dim=movie_embedding_dims,
        name="movie_embedding",
    )

    # Create a vector lookup for movie genres: a frozen embedding table whose
    # rows are the genre vectors taken from `movies`.
    genre_vectors = movies[genres].to_numpy()
    movie_genres_lookup = layers.Embedding(
        input_dim=genre_vectors.shape[0],
        output_dim=genre_vectors.shape[1],
        embeddings_initializer=tf.keras.initializers.Constant(genre_vectors),
        trainable=False,
        name="genres_vector")

    # Create a processing layer for genres.
    movie_embedding_processor = layers.Dense(
        units=movie_embedding_dims,
        activation="relu",
        name="process_movie_embedding_with_genres",
    )

    # Define a function to encode a given movie id.
    def encode_movie(movie_id):
        # Convert the string input values into integer indices.
        movie_idx = movie_index_lookup(movie_id)
        movie_embedding = movie_embedding_encoder(movie_idx)
        encoded_movie = movie_embedding
        if include_movie_features:
            # Merge the learned embedding with the fixed genre vector and
            # project the result back to the movie embedding size.
            movie_genres_vector = movie_genres_lookup(movie_idx)
            encoded_movie = movie_embedding_processor(
                layers.concatenate([movie_embedding, movie_genres_vector]))
        return encoded_movie

    # Encoded target_movie_id.
    target_movie_id = inputs["target_movie_id"]
    encoded_target_movie = encode_movie(target_movie_id)

    # Encoding sequence movie_ids.
    sequence_movie_ids = inputs["sequence_movie_ids"]
    encoded_sequence_movies = encode_movie(sequence_movie_ids)

    # Create positional embedding.
    positional_embedding_encoder = layers.Embedding(
        input_dim=sequence_length,
        output_dim=movie_embedding_dims,
        name="positional_embedding",
    )
    # NOTE(review): only sequence_length - 1 positions are generated, which
    # assumes "sequence_movie_ids" holds sequence_length - 1 items - confirm
    # against the data preparation code.
    positions = tf.range(start=0, limit=sequence_length - 1, delta=1)
    encoded_positions = positional_embedding_encoder(positions)

    # Retrieve sequence ratings to incorporate them into the encoding
    # of the movie.
    sequence_ratings = tf.expand_dims(inputs["sequence_ratings"], -1)

    # Add the positional encoding to the movie encodings and multiply
    # them by rating.
    encoded_sequence_movies_with_position_and_rating = layers.Multiply()([
        (encoded_sequence_movies + encoded_positions), sequence_ratings
    ])

    # Construct the transformer inputs: one entry per sequence step,
    # followed by the target movie.
    for encoded_movie in tf.unstack(
            encoded_sequence_movies_with_position_and_rating, axis=1):
        encoded_transformer_features.append(tf.expand_dims(encoded_movie, 1))
    encoded_transformer_features.append(encoded_target_movie)

    encoded_transformer_features = layers.concatenate(
        encoded_transformer_features, axis=1)

    return encoded_transformer_features, encoded_other_features
Ejemplo n.º 27
0
class OCR:
    """Recognise the text fields of an art image with a CTC-decoded model.

    The pipeline crops fixed regions (coordinates from ``Config``, scaled by
    ``scale_ratio``), converts every crop into a normalised grey image of
    ``height`` x ``width`` pixels, runs all crops through the model as one
    batch and greedily CTC-decodes the predictions into strings.
    """

    # Reference colour and per-channel tolerance used to decide whether a
    # sub-attribute crop contains any pixel close to that colour.
    # NOTE(review): presumably the colour of the sub-attribute text - confirm.
    _SUBATTR_COLOR = (73, 83, 102)
    _SUBATTR_TOLERANCE = 25

    def __init__(self, model_weight='mn_model_weight.h5', scale_ratio=1):
        """Build the character tables and the model, then load its weights.

        Args:
            model_weight: Path of the weight file passed to ``load_weights``.
            scale_ratio: Factor applied to every ``Config`` coordinate before
                cropping.
        """
        self.scale_ratio = scale_ratio
        # Alphabet: every distinct character of the known names/attributes
        # plus digits and a few punctuation marks, in sorted order.
        self.characters = sorted(
            set("".join(
                sum(ArtsInfo.ArtNames, []) + ArtsInfo.TypeNames +
                list(ArtsInfo.MainAttrNames.values()) +
                list(ArtsInfo.SubAttrNames.values()) + list(".,+%0123456789"))))
        # Mapping characters to integers
        self.char_to_num = StringLookup(vocabulary=list(self.characters),
                                        num_oov_indices=0,
                                        mask_token="")

        # Mapping integers back to original characters
        self.num_to_char = StringLookup(
            vocabulary=self.char_to_num.get_vocabulary(),
            oov_token="",
            mask_token="",
            invert=True)

        self.width = 240
        self.height = 16
        self.max_length = 15
        self.build_model(input_shape=(self.width, self.height))
        self.model.load_weights(model_weight)

    def detect_info(self, art_img):
        """Recognise every text region of ``art_img`` and its star count.

        Returns:
            A dict mapping each region name returned by ``extract_art_info``
            to its decoded string, plus the key ``'star'`` holding the result
            of ``detect_star``.
        """
        info = self.extract_art_info(art_img)
        keys = sorted(info)
        # preprocess() yields (height, width); transpose it and add batch and
        # channel axes so the crops can be concatenated into one batch.
        batch = np.concatenate(
            [self.preprocess(info[key]).T[None, :, :, None] for key in keys],
            axis=0)
        texts = self.decode(self.model.predict(batch))
        result = dict(zip(keys, texts))
        result['star'] = self.detect_star(art_img)
        return result

    def _crop_scaled(self, art_img, coords):
        """Crop ``art_img`` to ``coords`` after scaling them by ``scale_ratio``."""
        return art_img.crop([i * self.scale_ratio for i in coords])

    def _lacks_subattr_color(self, crop):
        """Return True when no pixel of ``crop`` is close to the reference colour.

        A pixel counts as close when each of its channels is within
        ``_SUBATTR_TOLERANCE`` of ``_SUBATTR_COLOR``.
        """
        # np.float64 replaces the `np.float` alias, which was removed from
        # NumPy and raised AttributeError on current releases.
        pixels = np.array(crop, np.float64)
        deviation = np.abs(pixels - [[self._SUBATTR_COLOR]]).max(axis=-1)
        return bool(np.all(deviation > self._SUBATTR_TOLERANCE))

    def extract_art_info(self, art_img):
        """Crop the text regions of ``art_img``.

        Returns:
            A dict with the keys ``name``, ``type``, ``main_attr_name``,
            ``main_attr_value``, ``level`` and ``subattr_1`` .. ``subattr_4``.
            Starting with the first sub-attribute crop that has no pixel close
            to the reference colour, that crop and all later sub-attribute
            crops are left out.
        """
        regions = (
            ('name', Config.name_coords),
            ('type', Config.type_coords),
            ('main_attr_name', Config.main_attr_name_coords),
            ('main_attr_value', Config.main_attr_value_coords),
            ('level', Config.level_coords),
            ('subattr_1', Config.subattr_1_coords),
            ('subattr_2', Config.subattr_2_coords),
            ('subattr_3', Config.subattr_3_coords),
            ('subattr_4', Config.subattr_4_coords),
        )
        # An explicit dict replaces the former locals()/del bookkeeping and
        # avoids shadowing the builtin `type`.
        info = {
            key: self._crop_scaled(art_img, coords)
            for key, coords in regions
        }

        subattr_keys = ['subattr_1', 'subattr_2', 'subattr_3', 'subattr_4']
        for position, key in enumerate(subattr_keys):
            if self._lacks_subattr_color(info[key]):
                # Drop this slot and every later one.
                for stale_key in subattr_keys[position:]:
                    del info[stale_key]
                break
        return info

    def detect_star(self, art_img):
        """Estimate the star count from the aspect ratio of the star region."""
        star = art_img.crop([i * self.scale_ratio for i in Config.star_coords])
        cropped_star = self.crop(self.normalize(self.to_gray(star)))
        coef = cropped_star.shape[1] / cropped_star.shape[0]
        # Linear map from width/height ratio to a count.
        # NOTE(review): the constants look empirically fitted - confirm.
        coef = coef / 1.30882352 + 0.21568627
        return int(round(coef))

    def to_gray(self, text_img):
        """Convert an image to a 2-D ``float32`` array.

        Inputs with more than two dimensions are reduced by weighting their
        first three channels with 0.299, 0.587 and 0.114.
        """
        text_img = np.array(text_img)
        if len(text_img.shape) > 2:
            text_img = (
                text_img[..., :3] @ [[[0.299], [0.587], [0.114]]])[:, :, 0]
        return np.array(text_img, np.float32)

    def normalize(self, img, auto_inverse=True):
        """Rescale ``img`` to the range [0, 1], optionally inverting it.

        The array is shifted and scaled in place. A constant image becomes all
        zeros instead of being divided by zero.

        Args:
            img: 2-D float array.
            auto_inverse: When true and the bottom-right pixel is above 0.5
                after scaling, return ``1 - img`` (a new array).

        Returns:
            The normalised image.
        """
        img -= img.min()
        peak = img.max()
        # Skip the division for a flat image: 0 / 0 would fill it with NaN.
        if peak > 0:
            img /= peak
        if auto_inverse and img[-1, -1] > 0.5:
            img = 1 - img
        return img

    def crop(self, img, tol=0.7):
        """Trim ``img`` to the bounding box of the pixels above ``tol``.

        Args:
            img: 2-D image data.
            tol: Threshold a pixel must exceed to count as content.

        Returns:
            The sub-array spanning the first to the last row and column that
            contain a pixel above ``tol``. When no pixel qualifies, the whole
            image is returned.
        """
        mask = img > tol
        m, n = img.shape
        mask0, mask1 = mask.any(0), mask.any(1)
        # argmax gives the first True; on the reversed mask it locates the
        # last True counted from the end.
        col_start, col_end = mask0.argmax(), n - mask0[::-1].argmax()
        row_start, row_end = mask1.argmax(), m - mask1[::-1].argmax()
        return img[row_start:row_end, col_start:col_end]

    def resize_to_height(self, img):
        """Resize ``img`` to ``self.height`` rows, keeping its aspect ratio.

        The image is expected in [0, 1]; it is converted to 8-bit for the
        bilinear resize and scaled back to [0, 1] afterwards.
        """
        height = self.height
        return (np.array(
            Image.fromarray(np.uint8(img * 255)).resize(
                (int(img.shape[1] * height / img.shape[0]), height),
                Image.BILINEAR,
            )) / 255)

    def pad_to_width(self, img):
        """Return ``img`` with exactly ``self.width`` columns.

        Wider images are cut on the right; narrower ones are zero-padded on
        the right.
        """
        width = self.width
        if img.shape[1] >= width:
            return img[:, :width]
        return np.pad(img, [[0, 0], [0, width - img.shape[1]]],
                      mode="constant",
                      constant_values=0)

    def preprocess(self, text_img):
        """Turn a cropped region into a ``height`` x ``width`` array in [0, 1]."""
        result = self.to_gray(text_img)
        result = self.normalize(result, True)
        result = self.crop(result)
        # Re-normalise after cropping, without another inversion.
        result = self.normalize(result, False)
        result = self.resize_to_height(result)
        result = self.pad_to_width(result)
        return result

    def decode(self, pred):
        """Greedy CTC-decode a batch of predictions into Python strings.

        Every decoded sequence is limited to ``self.max_length`` labels.
        """
        input_len = np.ones(pred.shape[0]) * pred.shape[1]
        # Use greedy search. For complex tasks, you can use beam search
        results = ctc_decode(pred, input_length=input_len,
                             greedy=True)[0][0][:, :self.max_length]
        # Iterate over the results and get back the text
        output_text = []
        for res in results:
            res = self.num_to_char(res)
            res = reduce_join(res)
            res = res.numpy().decode("utf-8")
            output_text.append(res)
        return output_text

    def build_model(self, input_shape):
        """Create ``self.model``: MobileNetV3-Small, two BiLSTMs, softmax.

        Args:
            input_shape: ``(width, height)`` of the single-channel input.
        """
        input_img = Input(shape=(input_shape[0], input_shape[1], 1),
                          name="image",
                          dtype="float32")
        mobilenet = MobileNetV3_Small((input_shape[0], input_shape[1], 1),
                                      0,
                                      alpha=1.0,
                                      include_top=False).build()
        x = mobilenet(input_img)
        # Flatten the backbone output into a sequence along the first axis.
        # NOTE(review): assumes the backbone downsamples by 8 and outputs 576
        # channels - confirm against MobileNetV3_Small.
        new_shape = ((input_shape[0] // 8), (input_shape[1] // 8) * 576)
        x = Reshape(target_shape=new_shape, name="reshape")(x)
        x = Dense(64, activation="relu", name="dense1")(x)
        x = Dropout(0.2)(x)

        # RNNs
        x = Bidirectional(LSTM(128, return_sequences=True, dropout=0.25))(x)
        x = Bidirectional(LSTM(64, return_sequences=True, dropout=0.25))(x)

        # Output layer: two more classes than there are characters.
        output = Dense(len(self.characters) + 2,
                       activation="softmax",
                       name="dense2")(x)

        # Define the model
        self.model = Model(inputs=[input_img],
                           outputs=output,
                           name="ocr_model_v1")
Ejemplo n.º 28
0
def main():
    """Train and evaluate a neural decision tree, then a neural decision forest.

    Downloads the adult train/test CSV files, stores cleaned copies locally,
    derives the dataset metadata, runs both experiments through
    ``run_experiment`` and finally exits the process with status 0.
    """
    # Prepare the data.
    CSV_HEADER = [
        "age", "workclass", "fnlwgt", "education", "education_num",
        "marital_status", "occupation", "relationship", "race", "gender",
        "capital_gain", "capital_loss", "hours_per_week", "native_country",
        "income_bracket",
    ]

    train_data = pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data",
        header=None,
        names=CSV_HEADER,
    )
    test_data = pd.read_csv(
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test",
        header=None,
        names=CSV_HEADER,
    )

    print(f"Train dataset shape: {train_data.shape}")
    print(f"Test dataset shape: {test_data.shape}")

    # The first test record is not a valid example, and the test labels carry
    # a trailing "dot" that the training labels do not have.
    test_data = test_data[1:]
    test_data.income_bracket = test_data.income_bracket.apply(
        lambda value: value.replace(".", ""))

    # Keep local, header-less CSV copies of both splits.
    train_data_file, test_data_file = "train_data.csv", "test_data.csv"
    for frame, path in ((train_data, train_data_file),
                        (test_data, test_data_file)):
        frame.to_csv(path, index=False, header=False)

    # Dataset metadata used for reading, parsing and encoding the features.
    NUMERICAL_FEATURE_NAMES = [
        "age", "education_num", "capital_gain", "capital_loss",
        "hours_per_week",
    ]

    # Categorical features mapped to their sorted training-set vocabulary.
    categorical_columns = (
        "workclass", "education", "marital_status", "occupation",
        "relationship", "race", "gender", "native_country",
    )
    CATEGORICAL_FEATURES_WITH_VOCABULARY = {
        column: sorted(train_data[column].unique())
        for column in categorical_columns
    }

    # Columns that are parsed but not fed to the model.
    IGNORE_COLUMN_NAMES = ["fnlwgt"]

    CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY)
    FEATURE_NAMES = NUMERICAL_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES

    # Numeric and ignored columns default to 0.0, everything else to "NA".
    float_columns = set(NUMERICAL_FEATURE_NAMES) | set(IGNORE_COLUMN_NAMES)
    COLUMN_DEFAULTS = [[0.0] if column in float_columns else ["NA"]
                       for column in CSV_HEADER]

    # Target column and its label values.
    TARGET_FEATURE_NAME = "income_bracket"
    TARGET_LABELS = [" <=50K", " >50K"]

    # Lookup converting a target label into an integer index.
    target_label_lookup = StringLookup(vocabulary=TARGET_LABELS,
                                       mask_token=None,
                                       num_oov_indices=0)

    # Training settings shared by both experiments.
    learning_rate = 0.01
    batch_size = 265
    num_epochs = 10
    hidden_units = [64, 64]
    num_classes = len(TARGET_LABELS)

    # Experiment 1: a single neural decision tree that sees every feature.
    # `num_trees` is assigned here but not passed to create_tree_model.
    num_trees = 10
    depth = 10
    used_features_rate = 1.0

    tree_model = create_tree_model(
        FEATURE_NAMES, NUMERICAL_FEATURE_NAMES, CATEGORICAL_FEATURE_NAMES,
        CATEGORICAL_FEATURES_WITH_VOCABULARY, depth, used_features_rate,
        num_classes)
    run_experiment(
        tree_model, learning_rate, train_data_file, test_data_file,
        CSV_HEADER, COLUMN_DEFAULTS, TARGET_FEATURE_NAME, target_label_lookup,
        batch_size, num_epochs)

    # Experiment 2: a neural decision forest of `num_trees` trees, each one
    # using a random 50% of the input features (used_features_rate) and a
    # depth of 5 instead of the 10 used above.
    num_trees = 25
    depth = 5
    used_features_rate = 0.5

    forest_model = create_forest_model(
        FEATURE_NAMES, NUMERICAL_FEATURE_NAMES, CATEGORICAL_FEATURE_NAMES,
        CATEGORICAL_FEATURES_WITH_VOCABULARY, num_trees, depth,
        used_features_rate, num_classes)
    run_experiment(
        forest_model, learning_rate, train_data_file, test_data_file,
        CSV_HEADER, COLUMN_DEFAULTS, TARGET_FEATURE_NAME, target_label_lookup,
        batch_size, num_epochs)

    # Exit the program.
    exit(0)
Ejemplo n.º 29
0
 def __init__(self):
     """Initialise the parent class with a default-constructed ``StringLookup``."""
     super().__init__(StringLookup())