def __init__(self, model_weight='mn_model_weight.h5', scale_ratio=1):
    """Build the character lookups and the model, then load pretrained weights.

    Args:
        model_weight: Path handed to ``self.model.load_weights``.
        scale_ratio: Stored unchanged on the instance as ``scale_ratio``.
    """
    self.scale_ratio = scale_ratio

    # Fixed input geometry and maximum label length.
    self.width = 240
    self.height = 16
    self.max_length = 15

    # Collect every text the model may have to emit; ArtNames is flattened
    # with sum(..., []), so it is presumably a list of lists.
    known_texts = (
        sum(ArtsInfo.ArtNames, [])
        + ArtsInfo.TypeNames
        + list(ArtsInfo.MainAttrNames.values())
        + list(ArtsInfo.SubAttrNames.values())
        + list(".,+%0123456789")
    )
    # The alphabet is the sorted set of distinct characters in those texts.
    self.characters = sorted(set("".join(known_texts)))

    # Characters -> integer ids.
    self.char_to_num = StringLookup(
        vocabulary=list(self.characters),
        num_oov_indices=0,
        mask_token="",
    )
    # Integer ids -> characters, sharing the forward lookup's vocabulary.
    self.num_to_char = StringLookup(
        vocabulary=self.char_to_num.get_vocabulary(),
        oov_token="",
        mask_token="",
        invert=True,
    )

    self.build_model(input_shape=(self.width, self.height))
    self.model.load_weights(model_weight)
def encode_string_categorical_feature(feature, name, dataset):
    """Encode a string feature with ``CategoryEncoding(output_mode="binary")``.

    Args:
        feature: Input the encoding is applied to.
        name: Key used to pick this feature out of each dataset element.
        dataset: Dataset yielding ``(features, label)`` pairs; its values for
            ``name`` define the vocabulary.

    Returns:
        The encoded feature.
    """
    # String -> integer index lookup; its vocabulary is learned below.
    lookup = StringLookup()

    # Keep only this feature's values and give each one a trailing axis.
    values_ds = dataset.map(lambda x, y: x[name])
    values_ds = values_ds.map(lambda x: tf.expand_dims(x, -1))

    # Learn the vocabulary, then index the model input.
    lookup.adapt(values_ds)
    indexed_feature = lookup(feature)

    # The category encoder learns its index space from the indexed dataset.
    category_encoder = CategoryEncoding(output_mode="binary")
    category_encoder.adapt(values_ds.map(lookup))

    return category_encoder(indexed_feature)
def build(self, input_shape=None):
    """Create the vocabulary lookup and the embedding sub-layer.

    Args:
        input_shape: Shape of the string input. A rank-2 shape must have a
            last axis of size 1; that axis is squeezed away in ``call``.

    Raises:
        ValueError: If the shape is rank 2 and its last axis is not 1.
    """
    self.squeeze = False
    if 2 == len(input_shape):
        last_axis = input_shape[-1]
        if 1 != last_axis:
            raise ValueError(
                'Input 0 of layer {} is incompatible with the layer: if ndim=2 expected axis[-1]=1, found '
                'axis[-1]={}. Full shape received: {}'.format(self.name, last_axis, input_shape))

        # Sub-layers are built against the shape without the unit axis.
        self.squeeze = True
        input_shape = input_shape[:1]

    self.lookup = StringLookup(vocabulary=self._vocabulary, mask_token=None, oov_token=self.UNK_MARK)
    self.lookup.build(input_shape)

    if 'adapt' != self.embed_type:
        self.embed = layers.Embedding(
            self.lookup.vocabulary_size(), self.output_dim,
            embeddings_initializer=self.embeddings_initializer)
    else:
        self.embed = AdaptiveEmbedding(
            self.adapt_cutoff, self.lookup.vocabulary_size(), self.output_dim,
            factor=self.adapt_factor,
            embeddings_initializer=self.embeddings_initializer)

    # Every embed_type other than 'dense_auto' (so 'adapt' too) builds the
    # embedding under the CPU device scope.
    if 'dense_auto' != self.embed_type:
        with tf.device('cpu:0'):
            self.embed.build(input_shape)
    else:
        self.embed.build(input_shape)

    super().build(input_shape)
def _category_lookup(self, params: dict): key, input_layer = self._get_input_layer(params) num_oov_buckets = params.get('num_oov_buckets', 0) if input_layer.dtype == 'string': if 'vocabulary_file' in params.keys(): return StringLookup(max_tokens=params['vocabulary_size'], num_oov_indices=num_oov_buckets, mask_token=None, vocabulary=params['vocabulary_file'])(input_layer) elif 'vocabulary_list' in params.keys(): return StringLookup(max_tokens=len(params['vocabulary_list']) + num_oov_buckets, num_oov_indices=num_oov_buckets, mask_token=None, vocabulary=params['vocabulary_list'])(input_layer) else: if 'vocabulary_file' in params.keys(): return IntegerLookup(max_values=params['vocabulary_size'] + num_oov_buckets, num_oov_indices=num_oov_buckets, mask_value=None, vocabulary=['vocabulary_file'])(input_layer) elif 'vocabulary_list' in params.keys(): return IntegerLookup(max_values=len(params['vocabulary_list']) + num_oov_buckets, num_oov_indices=num_oov_buckets, mask_value=None, vocabulary=params['vocabulary_list'])(input_layer)
def __init__(self, log_dir):
    """Store the log directory and build the character-to-id lookup.

    Args:
        log_dir: Stored unchanged on the instance as ``log_dir``.
    """
    self.log_dir = log_dir

    # Sequence delimiters, appended after the printable characters.
    self.START_TOKEN, self.END_TOKEN = '[SOS]', '[EOS]'

    printable_chars = sorted(set(string.printable))
    self.vocab = printable_chars + [self.START_TOKEN, self.END_TOKEN]

    self.chars_to_ids = StringLookup(vocabulary=self.vocab)
    # NOTE(review): newer Keras releases expose this as vocabulary_size();
    # confirm against the TensorFlow version this project pins.
    self.vocab_size = self.chars_to_ids.vocab_size()
def encode_inputs(inputs):
    """Encode all model inputs and concatenate them into one tensor.

    Features named in ``CATEGORICAL_FEATURE_NAMES`` are indexed with a
    ``StringLookup`` and embedded. All other features pass through as they
    are, gaining a trailing axis when their last dimension is ``None``.

    Args:
        inputs: Mapping from feature name to input tensor.

    Returns:
        The concatenation of every encoded feature.
    """
    encoded_features = []
    for feature_name in inputs:
        if feature_name not in CATEGORICAL_FEATURE_NAMES:
            # Numerical feature: used as-is.
            encoded = inputs[feature_name]
            if inputs[feature_name].shape[-1] is None:
                encoded = tf.expand_dims(encoded, -1)
        else:
            vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
            # No mask token and no out-of-vocabulary bucket are reserved.
            lookup = StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=0)
            value_index = lookup(inputs[feature_name])
            # Embedding width is the integer square root of the vocabulary size.
            embedding_dims = int(math.sqrt(lookup.vocabulary_size()))
            embedding = layers.Embedding(input_dim=lookup.vocabulary_size(), output_dim=embedding_dims)
            encoded = embedding(value_index)
        encoded_features.append(encoded)

    return layers.concatenate(encoded_features)
def load_data(self):
    """Read the text at ``self.file_path`` and build a character-id dataset.

    The text is split into characters and mapped to ids. The ids are grouped
    into sequences of ``C.SEQUENCE_LENGTH + 1``, batched, mapped through
    ``self._to_inputs_and_targets``, shuffled and prefetched. The inverse
    lookup is stored on ``self.ids_to_chars_layer``.

    Returns:
        The resulting dataset.
    """
    # Close the handle deterministically; the original left it open.
    # Assumes GFile is TensorFlow's tf.io.gfile.GFile, which is a context
    # manager - TODO confirm against the file's imports.
    with GFile(self.file_path, 'rb') as text_file:
        data = text_file.read().decode(encoding='UTF-8')

    # The vocabulary is the sorted set of distinct characters in the text.
    vocab = sorted(set(data))
    chars_to_ids = StringLookup(vocabulary=vocab)
    self.ids_to_chars_layer = StringLookup(
        vocabulary=chars_to_ids.get_vocabulary(), invert=True)

    # Split the entire text by character and convert to ids.
    chars = unicode_split(data, 'UTF-8')
    ids_of_chars = chars_to_ids(chars)

    # Group characters into sequences (+1 since the targets are shifted by one).
    sequences_ds = Dataset.from_tensor_slices(ids_of_chars)
    sequences_ds = sequences_ds.batch(C.SEQUENCE_LENGTH + 1)

    # Batch the sequences, then derive (inputs, targets) per batch.
    ds = sequences_ds.padded_batch(C.BATCH_SIZE)
    ds = ds.map(self._to_inputs_and_targets,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds = ds.shuffle(C.BUFFER_SIZE)
    ds = ds.prefetch(tf.data.experimental.AUTOTUNE)
    return ds
def load_data():
    """Download the Titanic training CSV and return preprocessed splits.

    Float columns are concatenated and normalized. Object (string) columns go
    through a ``StringLookup`` and then a ``CategoryEncoding``. The
    preprocessing model is plotted, applied to the whole table, and the
    result is split 80/20.

    Returns:
        ``(train_features, train_labels, test_features, test_labels)``.
    """
    data = pd.read_csv(
        "https://storage.googleapis.com/tf-datasets/titanic/train.csv")
    from sklearn.model_selection import train_test_split

    labels = data.pop('survived')
    label_names = ["Not survived", "Survived"]

    # One symbolic input per column: string for object columns, float32 otherwise.
    features = {}
    for name, column in data.items():
        input_dtype = string if column.dtype == object else float32
        features[name] = Input(shape=(1, ), name=name, dtype=input_dtype)

    # Numeric columns: concatenate, then normalize with statistics from the data.
    numeric_features = {}
    for name, feature in features.items():
        if feature.dtype == float32:
            numeric_features[name] = feature
    x = Concatenate()(list(numeric_features.values()))
    norm = Normalization()
    norm.adapt(np.array(data[numeric_features.keys()]))
    numeric_features = norm(x)
    processed_features = [numeric_features]

    # Non-numeric columns: index the strings, then category-encode the indices.
    for name, feature in features.items():
        if feature.dtype != float32:
            word = StringLookup(vocabulary=np.unique(data[name]))
            one_hot = CategoryEncoding(max_tokens=word.vocab_size())
            processed_features.append(one_hot(word(feature)))

    # Wrap the preprocessing graph in a model and plot it.
    processed_features = Concatenate()(processed_features)
    processed_features = Model(features, processed_features)
    utils.plot_model(model=processed_features,
                     rankdir='LR',
                     dpi=72,
                     show_shapes=True)

    # Run the raw columns through the preprocessing model, then split.
    feature_dict = {}
    for name, value in data.items():
        feature_dict[name] = np.array(value)
    all_features = processed_features(feature_dict).numpy()
    train_features, test_features, train_labels, test_labels = train_test_split(
        all_features, labels, test_size=0.2)
    return train_features, train_labels, test_features, test_labels
def build(self, input_shape):
    """Create the character-category lookup when a category option is on.

    Args:
        input_shape: Forwarded to the parent ``build``.

    Raises:
        ValueError: If the lookup does not report a vocabulary size of 30.
    """
    wants_first = self.options & WordShape.SHAPE_CHAR_CAT_FIRST
    wants_last = self.options & WordShape.SHAPE_CHAR_CAT_LAST
    if wants_first or wants_last:
        # Two-letter category codes; 'Cn' is passed as the OOV token.
        category_vocab = (
            ['Lu', 'Ll', 'Lt', 'Lm', 'Lo']
            + ['Mn', 'Me', 'Mc']
            + ['Nd', 'Nl', 'No']
            + ['Zs', 'Zl', 'Zp']
            + ['Cc', 'Cf', 'Co', 'Cs']
            + ['Pd', 'Ps', 'Pe', 'Pc', 'Po']
            + ['Sm', 'Sc', 'Sk', 'So']
            + ['Pi', 'Pf']
        )
        self.cat_lookup = StringLookup(num_oov_indices=0, oov_token='Cn', vocabulary=category_vocab)
        # The one-hot depth used by the layer is fixed at 30.
        if 30 != self.cat_lookup.vocab_size():
            raise ValueError('Wrong vocabulary size')

    super(WordShape, self).build(input_shape)
def encode_string_categorical_feature(feature, name, dataset):
    """Index a string feature and category-encode it in "binary" mode.

    Args:
        feature: Input the encoding is applied to.
        name: Key of this feature inside each dataset element.
        dataset: Dataset of ``(features, label)`` pairs used to learn the
            vocabulary and the index space.

    Returns:
        The encoded feature.
    """
    string_to_index = StringLookup()

    # Project the dataset onto this feature and add a trailing axis.
    def _select(x, y):
        return x[name]

    def _add_axis(x):
        return tf.expand_dims(x, -1)

    column_ds = dataset.map(_select).map(_add_axis)

    # Learn the vocabulary, then index the model input.
    string_to_index.adapt(column_ds)
    indices = string_to_index(feature)

    # Learn the index space from the indexed column, then encode.
    to_binary = CategoryEncoding(output_mode="binary")
    index_ds = column_ds.map(string_to_index)
    to_binary.adapt(index_ds)
    return to_binary(indices)
def encode_inputs(inputs, encoding_size):
    """Encode each input feature to width ``encoding_size``.

    Features with an entry in ``CATEGORICAL_FEATURES_WITH_VOCABULARY`` are
    indexed and embedded. All others get a trailing axis and a ``Dense``
    projection.

    Args:
        inputs: Mapping from feature name to input tensor.
        encoding_size: Output width of every embedding and projection.

    Returns:
        List of encoded features, in the iteration order of ``inputs``.
    """
    encoded_features = []
    for feature_name in inputs:
        raw_feature = inputs[feature_name]
        if feature_name in CATEGORICAL_FEATURES_WITH_VOCABULARY:
            vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
            # No mask token and no out-of-vocabulary bucket are reserved.
            to_index = StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=0)
            value_index = to_index(raw_feature)
            embedder = layers.Embedding(input_dim=len(vocabulary), output_dim=encoding_size)
            encoded = embedder(value_index)
        else:
            # Numeric feature: linear projection to encoding_size.
            expanded = tf.expand_dims(raw_feature, -1)
            encoded = layers.Dense(units=encoding_size)(expanded)
        encoded_features.append(encoded)
    return encoded_features
def encode_inputs(inputs, use_embedding=False):
    """Encode all input features and concatenate them.

    Categorical features are indexed with a ``StringLookup`` and then either
    embedded or category-encoded in "binary" mode. Every other feature only
    gets a trailing axis.

    Args:
        inputs: Mapping from feature name to input tensor.
        use_embedding: Embed categorical features instead of
            category-encoding them.

    Returns:
        The concatenation of all encoded features.
    """
    encoded_features = []
    for feature_name in inputs:
        if feature_name not in CATEGORICAL_FEATURE_NAMES:
            # Numerical feature: used as-is apart from the added axis.
            encoded_features.append(tf.expand_dims(inputs[feature_name], -1))
            continue

        vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
        # No mask token and no out-of-vocabulary bucket are reserved.
        index = StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=0)
        value_index = index(inputs[feature_name])

        if not use_embedding:
            # The encoder learns its index space from the indexed vocabulary.
            onehot_encoder = CategoryEncoding(output_mode="binary")
            onehot_encoder.adapt(index(vocabulary))
            encoded_features.append(onehot_encoder(value_index))
        else:
            # Embedding width is the integer square root of the vocabulary size.
            embedding_dims = int(math.sqrt(len(vocabulary)))
            embedding_encoder = layers.Embedding(input_dim=len(vocabulary), output_dim=embedding_dims)
            encoded_features.append(embedding_encoder(value_index))

    return layers.concatenate(encoded_features)
class DataManager:
    """Loads paired SVG text / PNG image samples from a log directory."""

    def __init__(self, log_dir):
        """Store the log directory and build the character-to-id lookup.

        Args:
            log_dir: Directory holding ``file_names.txt`` and the ``svgs`` /
                ``imgs`` sub-directories read by the other methods.
        """
        self.log_dir = log_dir
        self.START_TOKEN, self.END_TOKEN = '[SOS]', '[EOS]'

        # Printable characters in sorted order, followed by the two delimiters.
        printable_chars = sorted(set(string.printable))
        self.vocab = printable_chars + [self.START_TOKEN, self.END_TOKEN]

        self.chars_to_ids = StringLookup(vocabulary=self.vocab)
        self.vocab_size = self.chars_to_ids.vocab_size()

    def load_dataset(self):
        """Build a dataset from the first five names in ``file_names.txt``.

        Returns:
            Parsed samples in padded batches of two, remainder dropped.
        """
        names_file = str(pathlib.Path(self.log_dir, 'file_names.txt'))
        return (
            TextLineDataset(names_file)
            .take(5)
            .map(self.parse_svg_img, num_parallel_calls=tf.data.experimental.AUTOTUNE)
            .padded_batch(2, drop_remainder=True)
        )

    def parse_svg_img(self, file_name):
        """Load one sample's SVG as character ids and its PNG as floats.

        Args:
            file_name: Base name shared by the ``.svg`` and ``.png`` files.

        Returns:
            ``((svg_ids, image), svg_ids)``.
        """
        # SVG text -> characters wrapped in the start/end tokens -> ids.
        svg_path = tf.strings.join([self.log_dir, '/svgs/', file_name, '.svg'])
        svg_chars = unicode_split(tf.io.read_file(svg_path), 'UTF-8')
        svg_tokens = tf.concat([[self.START_TOKEN], svg_chars, [self.END_TOKEN]], axis=0)
        svg_ids = self.chars_to_ids(svg_tokens)

        # PNG -> three channels, cast to float32 and divided by 255.
        img_path = tf.strings.join([self.log_dir, '/imgs/', file_name, '.png'])
        image = tf.io.decode_png(tf.io.read_file(img_path), channels=3)
        image = tf.cast(image, tf.float32) / 255.0

        return (svg_ids, image), svg_ids
def __init__(self, vocabulary, embedding_dim, num_buckets, name=None):
    """Create the index lookup and the two bucket embedding tables.

    Args:
        vocabulary: Vocabulary given to the ``StringLookup``.
        embedding_dim: Output width of both embedding tables.
        num_buckets: Number of rows in each embedding table.
        name: Optional layer name, forwarded to the parent constructor.
    """
    super(QREmbedding, self).__init__(name=name)
    self.num_buckets = num_buckets

    # No mask token and no out-of-vocabulary bucket are reserved.
    self.index_lookup = StringLookup(
        vocabulary=vocabulary,
        mask_token=None,
        num_oov_indices=0,
    )

    # Two tables of identical shape.
    self.q_embeddings = layers.Embedding(num_buckets, embedding_dim)
    self.r_embeddings = layers.Embedding(num_buckets, embedding_dim)
def _encode_categorical_feature(
    feature: KerasTensor,
    name: str,
    dataset: Optional[BatchDataset],
) -> KerasTensor:
    """Category-encode a string feature in "binary" mode.

    Args:
        - feature: The input layer of the feature.
        - name: The feature's name, used as the key into each dataset element.
        - dataset: The training data; when falsy, a placeholder tensor is
            returned instead of an encoding.

    Returns:
        The encoded tensor, or a float32 ``(None, 1)`` placeholder tensor.
    """
    # Without data (tuner initialization), hand back a generic tensor.
    if not dataset:
        placeholder_spec = TensorSpec(shape=(None, 1), dtype=tf.float32, name=None)
        return KerasTensor(type_spec=placeholder_spec)

    # String -> integer index lookup; its vocabulary is learned below.
    index = StringLookup()

    # Project the dataset onto this feature and add a trailing axis.
    values_ds = dataset.map(lambda x, y: x[name]).map(lambda x: tf.expand_dims(x, -1))

    # Learn the vocabulary and index the model input.
    index.adapt(values_ds)
    indexed_feature = index(feature)

    # The encoder learns its index space from 0 .. vocab_size - 1.
    encoder = CategoryEncoding(output_mode="binary")
    encoder.adapt(np.arange(index.vocab_size()))

    return encoder(indexed_feature)
def embedding_encoder(vocabulary, embedding_dim, num_oov_indices=0, name=None):
    """Build a ``Sequential`` model that indexes strings and embeds them.

    Args:
        vocabulary: Vocabulary for the ``StringLookup``.
        embedding_dim: Output width of the embedding.
        num_oov_indices: Out-of-vocabulary buckets; also added to the
            embedding's input dimension.
        name: Optional prefix; the model is named ``"<name>_embedding"``.

    Returns:
        The two-layer ``keras.Sequential`` model.
    """
    model_name = f"{name}_embedding" if name else None

    lookup_layer = StringLookup(
        vocabulary=vocabulary, mask_token=None, num_oov_indices=num_oov_indices
    )
    # One embedding row per vocabulary entry plus one per OOV bucket.
    embedding_layer = layers.Embedding(
        input_dim=len(vocabulary) + num_oov_indices, output_dim=embedding_dim
    )
    return keras.Sequential([lookup_layer, embedding_layer], name=model_name)
def get_svg_ds(self):
    """Build a dataset of character ids from ``datasets/svgs/simpleline.svg``.

    The file is split into characters and mapped to ids through a fixed
    five-character vocabulary. The ids are grouped into sequences of
    ``C.SEQUENCE_LENGTH`` and then into batches of ``C.BATCH_SIZE``. The
    inverse lookup is stored on ``self.ids_to_chars_layer``.

    Returns:
        The batched dataset.
    """
    # Close the handle deterministically; the original left it open.
    # Assumes GFile is TensorFlow's tf.io.gfile.GFile, which is a context
    # manager - TODO confirm against the file's imports.
    with GFile('datasets/svgs/simpleline.svg', 'rb') as svg_file:
        data = svg_file.read().decode(encoding='UTF-8')

    # Hard-coded vocabulary rather than one derived from the text.
    # NOTE(review): confirm that characters outside this list are meant to
    # map to the lookup's out-of-vocabulary index.
    vocab = ['e', 'g', 'n', 'r', '\n']

    # Build the char -> id lookup and its inverse.
    chars_to_ids = StringLookup(vocabulary=vocab)
    self.ids_to_chars_layer = StringLookup(
        vocabulary=chars_to_ids.get_vocabulary(), invert=True)

    # Split the entire text by character and convert to ids.
    chars = unicode_split(data, 'UTF-8')
    ids_of_chars = chars_to_ids(chars)

    # Group characters into sequences, then sequences into batches.
    svg_ds = Dataset.from_tensor_slices(ids_of_chars)
    svg_ds = svg_ds.batch(C.SEQUENCE_LENGTH)
    svg_ds = svg_ds.batch(C.BATCH_SIZE)
    return svg_ds
def __init__(self, emb_name, vocab):
    """Create the embedding table and string lookup for one vocabulary.

    Args:
        emb_name: Prefix for the embedding layer's name; also printed.
        vocab: Vocabulary used for the lookup and for sizing the embedding.
    """
    super(CustomEmbed, self).__init__()
    self.vocab = vocab
    self.vocab_size = len(vocab)
    # Embedding width is the integer square root of the vocabulary size.
    self.output_dim = int(math.sqrt(self.vocab_size))

    self.custom_embed = layers.Embedding(
        input_dim=self.vocab_size,
        output_dim=self.output_dim,
        name=f"{emb_name}_embedding",
    )
    # No mask token and no out-of-vocabulary bucket are reserved.
    self.stringLookUp = StringLookup(
        vocabulary=self.vocab,
        mask_token=None,
        num_oov_indices=0,
    )
    print(emb_name, self.output_dim)
def character_decoder(encoder):
    """Build the inverse lookup (ids -> characters) for an encoder.

    Parameters:
        encoder: lookup layer whose ``get_vocabulary()`` supplies the
            decoder's vocabulary.

    Returns:
        An inverted ``StringLookup`` with no mask token and one OOV index.
    """
    # NOTE(review): the decoder reserves one OOV index - confirm its ids line
    # up with the encoder it is paired with.
    decoder_vocabulary = encoder.get_vocabulary()
    return StringLookup(
        mask_token=None,
        num_oov_indices=1,
        vocabulary=decoder_vocabulary,
        invert=True,
    )
def character_encoder(vocab):
    """Build the forward lookup (characters -> ids).

    Parameters:
        vocab: iterable of characters to encode; converted with ``list``.

    Returns:
        A ``StringLookup`` with no mask token and no OOV index.
    """
    characters = list(vocab)
    return StringLookup(
        mask_token=None,
        num_oov_indices=0,
        vocabulary=characters,
        invert=False,
    )
""" ### Building the character vocabulary Keras provides different preprocessing layers to deal with different modalities of data. [This guide](https://keras.io/guides/preprocessing_layers/) provides a comprehensive introduction. Our example involves preprocessing labels at the character level. This means that if there are two labels, e.g. "cat" and "dog", then our character vocabulary should be {a, c, d, g, o, t} (without any special tokens). We use the [`StringLookup`](https://keras.io/api/layers/preprocessing_layers/categorical/string_lookup/) layer for this purpose. """ AUTOTUNE = tf.data.AUTOTUNE # Mapping characters to integers. char_to_num = StringLookup(vocabulary=list(characters), mask_token=None) # Mapping integers back to original characters. num_to_char = StringLookup(vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True) """ ### Resizing images without distortion Instead of square images, many OCR models work with rectangular images. This will become clearer in a moment when we visualize a few samples from the dataset. While aspect-unaware resizing of square images does not introduce a significant amount of distortion, this is not the case for rectangular images. But resizing images to a uniform size is a requirement for mini-batching. So we need to perform our resizing such that the following criteria are met:
class WordEmbedding(layers.Layer):
    """Embedding layer that takes string words and looks them up by vocabulary.

    Words are first passed through ``adapt`` (optional unicode normalization,
    lower-casing, digit zeroing and length capping), then mapped to indices
    with a ``StringLookup`` and embedded.
    """
    # Token used as the lookup's out-of-vocabulary token; always the first reserved word.
    UNK_MARK = '[UNK]'
    # Placeholder put between the kept head and tail of words longer than max_len.
    REP_CHAR = '\uFFFD'

    def __init__(self, vocabulary, output_dim, normalize_unicode='NFKC', lower_case=False, zero_digits=False,
                 max_len=None, reserved_words=None, embed_type='dense_auto', adapt_cutoff=None, adapt_factor=4,
                 embeddings_initializer='uniform', **kwargs):
        """Validate the configuration and prepare the effective vocabulary.

        Args:
            vocabulary: List of unique strings.
            output_dim: Embedding width.
            normalize_unicode: Unicode normalization form; falsy disables it.
            lower_case: Whether ``adapt`` lower-cases words.
            zero_digits: Whether ``adapt`` applies ``miss_text.zero_digits``.
            max_len: ``None`` or an integer >= 3; longer words are shortened.
            reserved_words: Optional list of unique words that the ``adapt``
                transformations skip.
            embed_type: One of ``'dense_auto'``, ``'dense_cpu'``, ``'adapt'``.
            adapt_cutoff: Passed to ``AdaptiveEmbedding`` for ``'adapt'``.
            adapt_factor: Passed to ``AdaptiveEmbedding`` as ``factor``.
            embeddings_initializer: Initializer identifier for the embedding.
            **kwargs: Forwarded to the parent layer constructor.

        Raises:
            ValueError: On a non-list or non-unique vocabulary, ``max_len``
                below 3, non-unique reserved words, or an unknown
                ``embed_type``.
        """
        super().__init__(**kwargs)
        self.input_spec = layers.InputSpec(min_ndim=1, max_ndim=2, dtype='string')

        if not isinstance(vocabulary, list) or not all(map(lambda x: isinstance(x, str), vocabulary)):
            raise ValueError('Expected "vocabulary" to be a list of strings')
        if len(vocabulary) != len(set(vocabulary)):
            raise ValueError('Expected "vocabulary" to contain unique values')
        # The vocabulary as passed in is kept for get_config.
        self.vocabulary = vocabulary
        self.output_dim = output_dim
        self.normalize_unicode = normalize_unicode
        self.lower_case = lower_case
        self.zero_digits = zero_digits

        if max_len is not None and max_len < 3:
            raise ValueError('Expected "max_len" to be None or greater then 2')
        self.max_len = max_len

        if reserved_words and len(reserved_words) != len(set(reserved_words)):
            raise ValueError('Expected "reserved_words" to contain unique values')
        self.reserved_words = reserved_words

        if embed_type not in {'dense_auto', 'dense_cpu', 'adapt'}:
            raise ValueError('Expected "embed_type" to be one of "dense_auto", "dense_cpu" or "adapt"')
        self.embed_type = embed_type
        self.adapt_cutoff = adapt_cutoff
        self.adapt_factor = adapt_factor
        self.embeddings_initializer = initializers.get(embeddings_initializer)

        # Effective reserved words: UNK_MARK first, then the user's list without UNK_MARK.
        all_reserved_words = [] if reserved_words is None else [r for r in reserved_words if self.UNK_MARK != r]
        self._reserved_words = [self.UNK_MARK] + all_reserved_words

        # Reserved words missing from the vocabulary only produce a warning.
        miss_reserved_words = [m for m in self._reserved_words if m not in vocabulary]
        if miss_reserved_words:
            tf.get_logger().warning('Vocabulary missed some reserved_words values: {}. '
                                    'This may indicate an error in vocabulary estimation'.format(miss_reserved_words))

        # Effective vocabulary: reserved words first, then the remaining words in their given order.
        clean_vocab = [w for w in vocabulary if w not in self._reserved_words]
        self._vocabulary = self._reserved_words + clean_vocab

    def vocab(self, word_counts, **kwargs):
        """Re-count words after passing them through ``adapt``.

        Args:
            word_counts: Non-empty mapping from word (str) to count.
            **kwargs: Unused here.

        Returns:
            A ``Vocabulary`` in which each adapted form has accumulated the
            counts of the words that produced it.

        Raises:
            ValueError: If ``word_counts`` is empty or has a non-string key.
        """
        if not word_counts:
            raise ValueError('Can\'t estimate vocabulary with empty word counter')
        if not all(map(lambda k: isinstance(k, str), word_counts.keys())):
            raise ValueError('Expected all words to be strings')

        word_counts = Vocabulary(word_counts)
        word_tokens = word_counts.tokens()
        adapt_words = self.adapt(word_tokens)
        # Give rank-1 results a trailing axis so each word has a row of adapted forms.
        if 1 == adapt_words.shape.rank:
            adapt_words = adapt_words[..., None]

        adapt_counts = Vocabulary()
        for adapts, word in zip(adapt_words, word_tokens):
            # Flatten the row and decode its bytes to Python strings.
            adapts = np.char.decode(adapts.numpy().reshape([-1]).astype('S'), 'utf-8')
            for adapt in adapts:
                adapt_counts[adapt] += word_counts[word]

        return adapt_counts

    @tf_utils.shape_type_conversion
    def build(self, input_shape=None):
        """Create the lookup and embedding sub-layers.

        Raises:
            ValueError: If the input is rank 2 and its last axis is not 1.
        """
        self.squeeze = False
        if 2 == len(input_shape):
            if 1 != input_shape[-1]:
                raise ValueError(
                    'Input 0 of layer {} is incompatible with the layer: if ndim=2 expected axis[-1]=1, found '
                    'axis[-1]={}. Full shape received: {}'.format(self.name, input_shape[-1], input_shape))

            # Rank-2 input with a unit last axis: call() squeezes that axis away.
            self.squeeze = True
            input_shape = input_shape[:1]

        self.lookup = StringLookup(vocabulary=self._vocabulary, mask_token=None, oov_token=self.UNK_MARK)
        self.lookup.build(input_shape)

        if 'adapt' == self.embed_type:
            self.embed = AdaptiveEmbedding(
                self.adapt_cutoff, self.lookup.vocabulary_size(), self.output_dim, factor=self.adapt_factor,
                embeddings_initializer=self.embeddings_initializer)
        else:
            self.embed = layers.Embedding(
                self.lookup.vocabulary_size(), self.output_dim, embeddings_initializer=self.embeddings_initializer)

        if 'dense_auto' == self.embed_type:
            self.embed.build(input_shape)
        else:  # 'dense_cpu'; note that 'adapt' also reaches this CPU-pinned branch
            with tf.device('cpu:0'):
                self.embed.build(input_shape)

        super().build(input_shape)

    def adapt(self, inputs):
        """Apply the configured text transformations to ``inputs``.

        Reserved words are passed as ``skip`` to every ``miss_text`` helper.

        Returns:
            The transformed string tensor.
        """
        inputs = tf.convert_to_tensor(inputs, dtype='string')

        if self.normalize_unicode:
            inputs = miss_text.normalize_unicode(inputs, form=self.normalize_unicode, skip=self._reserved_words)
        if self.lower_case:
            inputs = miss_text.lower_case(inputs, skip=self._reserved_words)
        if self.zero_digits:
            inputs = miss_text.zero_digits(inputs, skip=self._reserved_words)

        if self.max_len is not None:
            # Candidate short form: head + REP_CHAR + tail, joined along the stacked axis.
            inputs_ = tf.stack([
                miss_text.sub_string(inputs, 0, self.max_len // 2, skip=self._reserved_words),
                tf.fill(tf.shape(inputs), self.REP_CHAR),
                miss_text.sub_string(inputs, -self.max_len // 2 + 1, -1, skip=self._reserved_words)], axis=-1)
            inputs_ = tf.strings.reduce_join(inputs_, axis=-1)
            # Only words longer than max_len (in UTF-8 characters) are replaced.
            sizes = tf.strings.length(inputs, unit='UTF8_CHAR')
            inputs = tf.where(sizes > self.max_len, inputs_, inputs)

        return inputs

    def call(self, inputs, **kwargs):
        """Adapt, index and embed the input words."""
        if self.squeeze:
            # Workaround for Sequential model test
            inputs = tf.squeeze(inputs, axis=-1)

        adapts = self.adapt(inputs)
        indices = self.lookup(adapts)
        outputs = self.embed(indices)

        return outputs

    @tf_utils.shape_type_conversion
    def compute_output_shape(self, input_shape):
        """Return ``input_shape`` with ``output_dim`` appended."""
        # NOTE(review): a (batch, 1) input is squeezed in call(), but the unit axis is kept here - confirm.
        return input_shape + (self.output_dim,)

    def get_config(self):
        """Return the constructor arguments as a serializable config dict."""
        config = super().get_config()
        config.update({
            'vocabulary': self.vocabulary,
            'output_dim': self.output_dim,
            'normalize_unicode': self.normalize_unicode,
            'lower_case': self.lower_case,
            'zero_digits': self.zero_digits,
            'max_len': self.max_len,
            'reserved_words': self.reserved_words,
            'embed_type': self.embed_type,
            'adapt_cutoff': self.adapt_cutoff,
            'adapt_factor': self.adapt_factor,
            'embeddings_initializer': initializers.serialize(self.embeddings_initializer)
        })

        return config
if __name__ == "__main__":
    # Three positional arguments are required, so argv needs at least 4 entries.
    if len(sys.argv) < 4:
        print("usage: python predict.py [input_path] [model_path] [output_path]")
        # Exit non-zero so callers can tell a usage error from a successful run.
        sys.exit(1)

    input_path = sys.argv[1]
    model_path = sys.argv[2]
    # Fix: the output path is the third argument; the original re-read
    # argv[2], silently aliasing it to the model path.
    output_path = sys.argv[3]

    data_loader = DataLoader(input_path, training_ratio=0.7)
    raw_train_ds, raw_val_ds = data_loader.load()

    # Why N? for one encoding purpose, last character = [0, 0, 0, 0]
    VOCAB = ["A", "G", "T", "N"]
    string_lookup = StringLookup(vocabulary=VOCAB)

    AUTOTUNE = tf.data.experimental.AUTOTUNE
    BATCH_SIZE = 256
    SHUFFLE_SIZE = 1000

    # Training pipeline: cache -> shuffle -> prefetch -> preprocess.
    encoded_train_ds = raw_train_ds.cache().shuffle(SHUFFLE_SIZE)
    encoded_train_ds = encoded_train_ds.prefetch(buffer_size=AUTOTUNE)
    encoded_train_ds = encoded_train_ds.map(preprocess)

    # Validation pipeline: cache -> preprocess (no shuffling).
    encoded_val_ds = raw_val_ds.cache().map(preprocess)

    train_ds = encoded_train_ds.cache().batch(BATCH_SIZE)
    train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)
    val_ds = encoded_val_ds.cache().batch(BATCH_SIZE)

    model = TwoTowerModel(RNA_length=33, gRNA_length=23)
TARGET_FEATURE_NAME = "income_bracket"
# A list of the labels of the target features.
TARGET_LABELS = [" <=50K", " >50K"]

"""
## Create `tf.data.Dataset` objects for training and validation

We create an input function to read and parse the file, and convert features and
labels into a [`tf.data.Dataset`](https://www.tensorflow.org/guide/datasets)
for training and validation. We also preprocess the input by mapping the target
label to an index.
"""

from tensorflow.keras.layers.experimental.preprocessing import StringLookup

# Maps each label string to its position in TARGET_LABELS; no mask token and
# no out-of-vocabulary bucket are reserved.
target_label_lookup = StringLookup(vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0)


def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128):
    """Read a CSV file into a dataset of ``(features, label_index)`` batches.

    Args:
        csv_file_path: Path of the CSV file to read.
        shuffle: Passed to ``make_csv_dataset``.
        batch_size: Number of rows per batch.

    Returns:
        A dataset whose targets have been mapped through
        ``target_label_lookup``.
    """
    dataset = tf.data.experimental.make_csv_dataset(
        csv_file_path,
        batch_size=batch_size,
        column_names=CSV_HEADER,
        column_defaults=COLUMN_DEFAULTS,
        label_name=TARGET_FEATURE_NAME,
        num_epochs=1,
        header=False,
        na_value="?",
        shuffle=shuffle,
    ).map(lambda features, target: (features, target_label_lookup(target)))
    # Fix: the dataset was built but never returned, so callers received None.
    return dataset
class WordShape(tf.keras.layers.Layer):
    """Layer that turns string words into hand-crafted "shape" features.

    ``options`` is a bitmask of the ``SHAPE_*`` flags below. Every selected
    scalar flag adds one feature. Each selected character-category flag adds
    a 30-way one-hot vector.
    """
    # Letter-case features.
    SHAPE_HAS_CASE = 1
    SHAPE_LOWER_CASE = 2
    SHAPE_UPPER_CASE = 4
    SHAPE_TITLE_CASE = 8
    SHAPE_MIXED_CASE = 16
    SHAPE_ALL_CASES = SHAPE_HAS_CASE | SHAPE_LOWER_CASE | SHAPE_UPPER_CASE | SHAPE_TITLE_CASE | SHAPE_MIXED_CASE

    # Mean and std length from Universal Dependencies and large Russian POS corpora
    # Tokens (split_words): 3.057 and 3.118
    # Words: 4.756 and 3.453
    SHAPE_LENGTH_NORM = 32

    # Equality with neighbouring words along the last axis.
    SHAPE_LEFT_SAME = 64
    SHAPE_RIGHT_SAME = 128
    SHAPE_LEFT2_SAME = 256
    SHAPE_RIGHT2_SAME = 512
    SHAPE_ALL_SAME = SHAPE_LEFT_SAME | SHAPE_RIGHT_SAME | SHAPE_LEFT2_SAME | SHAPE_RIGHT2_SAME

    # Category of the first / last character, one-hot encoded.
    SHAPE_CHAR_CAT_FIRST = 1024
    SHAPE_CHAR_CAT_LAST = 2048
    SHAPE_CHAR_CAT_BOTH = SHAPE_CHAR_CAT_FIRST | SHAPE_CHAR_CAT_LAST

    SHAPE_ALL = SHAPE_ALL_CASES | SHAPE_LENGTH_NORM | SHAPE_ALL_SAME | SHAPE_CHAR_CAT_BOTH

    def __init__(self, options, mean_len=3.906, std_len=3.285, char_embed=5, *args, **kwargs):
        """Store the selected options and the length-normalization constants.

        Args:
            options: Non-zero bitmask of ``SHAPE_*`` flags.
            mean_len: Subtracted from the word length for the length feature.
            std_len: Divisor applied after subtracting ``mean_len``.
            char_embed: Accepted but neither stored nor used in this class.
            *args: Forwarded to the parent layer constructor.
            **kwargs: Forwarded to the parent layer constructor.

        Raises:
            ValueError: If ``options`` is 0.
        """
        super(WordShape, self).__init__(*args, **kwargs)
        self.input_spec = tf.keras.layers.InputSpec(dtype='string')
        self._supports_ragged_inputs = True

        if 0 == options:
            raise ValueError('At least one shape option should be selected')

        self.options = options
        self.mean_len = mean_len
        self.std_len = std_len

    @tf_utils.shape_type_conversion
    def build(self, input_shape):
        """Create the character-category lookup when a category flag is set.

        Raises:
            ValueError: If the lookup's vocabulary size is not 30.
        """
        if self.options & WordShape.SHAPE_CHAR_CAT_FIRST or self.options & WordShape.SHAPE_CHAR_CAT_LAST:
            category_vocab = [
                'Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Mn', 'Me', 'Mc', 'Nd', 'Nl', 'No', 'Zs', 'Zl', 'Zp', 'Cc', 'Cf', 'Co',
                'Cs', 'Pd', 'Ps', 'Pe', 'Pc', 'Po', 'Sm', 'Sc', 'Sk', 'So', 'Pi', 'Pf'
            ]
            self.cat_lookup = StringLookup(num_oov_indices=0, oov_token='Cn', vocabulary=category_vocab)
            # call() one-hot encodes with depth=30, so the size must match exactly.
            if self.cat_lookup.vocab_size() != 30:
                raise ValueError('Wrong vocabulary size')

        super(WordShape, self).build(input_shape)

    def call(self, inputs, **kwargs):
        """Compute the selected features and join them on a new last axis.

        Returns:
            Features cast to ``self.compute_dtype``: the scalar features
            stacked along a new last axis, followed by any one-hot category
            vectors concatenated on that axis.
        """
        # outputs_one holds one scalar feature per word; outputs_many holds one-hot vectors.
        outputs_one, outputs_many = [], []

        # Case
        any_case = self.SHAPE_HAS_CASE | self.SHAPE_LOWER_CASE | self.SHAPE_UPPER_CASE | self.SHAPE_TITLE_CASE | \
                   self.SHAPE_MIXED_CASE
        if self.options & any_case:
            inputs_lower = lower_case(inputs)
            inputs_upper = upper_case(inputs)
            # A word "has case" when its lower- and upper-cased forms differ.
            has_case = tf.not_equal(inputs_lower, inputs_upper)
            if self.options & self.SHAPE_HAS_CASE:
                outputs_one.append(has_case)

            # is_lower / is_upper / is_title are also computed for MIXED_CASE, which needs all three.
            if self.options & self.SHAPE_LOWER_CASE or self.options & self.SHAPE_MIXED_CASE:
                is_lower = tf.logical_and(has_case, tf.equal(inputs, inputs_lower))
                if self.options & self.SHAPE_LOWER_CASE:
                    outputs_one.append(is_lower)

            if self.options & self.SHAPE_UPPER_CASE or self.options & self.SHAPE_MIXED_CASE:
                is_upper = tf.logical_and(has_case, tf.equal(inputs, inputs_upper))
                if self.options & self.SHAPE_UPPER_CASE:
                    outputs_one.append(is_upper)

            if self.options & self.SHAPE_TITLE_CASE or self.options & self.SHAPE_MIXED_CASE:
                inputs_title = title_case(inputs)
                is_title = tf.logical_and(has_case, tf.equal(inputs, inputs_title))
                if self.options & self.SHAPE_TITLE_CASE:
                    outputs_one.append(is_title)

            if self.options & self.SHAPE_MIXED_CASE:
                # Mixed = none of: caseless, all-lower, all-upper, title.
                no_case = tf.logical_not(has_case)
                is_mixed = tf.logical_not(
                    tf.logical_or(tf.logical_or(no_case, is_lower), tf.logical_or(is_upper, is_title)))
                outputs_one.append(is_mixed)

        # Length
        if self.options & self.SHAPE_LENGTH_NORM:
            length_norm = tf.strings.length(inputs, unit='UTF8_CHAR')
            length_norm = (tf.cast(length_norm, self.compute_dtype) - self.mean_len) / self.std_len
            outputs_one.append(length_norm)

        # Same
        any_same = self.SHAPE_LEFT_SAME | self.SHAPE_RIGHT_SAME | self.SHAPE_LEFT2_SAME | self.SHAPE_RIGHT2_SAME
        if self.options & any_same:
            # Pad the last axis with two empty strings on each side so shifted comparisons stay in range.
            empty_pad = tf.zeros_like(inputs[..., :1])
            inputs_padded = tf.concat(
                [empty_pad, empty_pad, inputs, empty_pad, empty_pad], axis=-1)

            if self.options & (self.SHAPE_LEFT_SAME | self.SHAPE_RIGHT_SAME):
                # Compare entries of the padded tensor that are one position apart.
                same_one = tf.equal(inputs_padded[..., 1:], inputs_padded[..., :-1])

                if self.options & self.SHAPE_LEFT_SAME:
                    same_left = same_one[..., 1:-2]
                    outputs_one.append(same_left)

                if self.options & self.SHAPE_RIGHT_SAME:
                    same_right = same_one[..., 2:-1]
                    outputs_one.append(same_right)

            if self.options & (self.SHAPE_LEFT2_SAME | self.SHAPE_RIGHT2_SAME):
                # Compare entries of the padded tensor that are two positions apart.
                same_two = tf.equal(inputs_padded[..., 2:], inputs_padded[..., :-2])

                if self.options & self.SHAPE_LEFT2_SAME:
                    same_left2 = same_two[..., :-2]
                    outputs_one.append(same_left2)

                if self.options & self.SHAPE_RIGHT2_SAME:
                    same_right2 = same_two[..., 2:]
                    outputs_one.append(same_right2)

        # Char category
        if self.options & WordShape.SHAPE_CHAR_CAT_FIRST:
            first_cats = char_category(inputs)
            first_ids = self.cat_lookup(first_cats)
            first_feats = tf.one_hot(first_ids, depth=30)
            outputs_many.append(first_feats)

        if self.options & WordShape.SHAPE_CHAR_CAT_LAST:
            last_cats = char_category(inputs, first=False)
            last_ids = self.cat_lookup(last_cats)
            last_feats = tf.one_hot(last_ids, depth=30)
            outputs_many.append(last_feats)

        outputs_one = [tf.cast(o, self.compute_dtype) for o in outputs_one]
        outputs_many = [tf.cast(o, self.compute_dtype) for o in outputs_many]

        # Only vector features selected: concatenate them directly.
        if not outputs_one:
            return tf.concat(outputs_many, axis=-1)

        outputs_one = tf.stack(outputs_one, axis=-1)
        # Only scalar features selected: the stacked tensor is the result.
        if not outputs_many:
            return outputs_one

        return tf.concat([outputs_one, *outputs_many], axis=-1)

    @tf_utils.shape_type_conversion
    def compute_output_shape(self, input_shape):
        """Return ``input_shape`` extended by the number of selected features."""
        units = 0
        # Each of these flags contributes exactly one feature.
        options = [
            self.SHAPE_HAS_CASE, self.SHAPE_LOWER_CASE, self.SHAPE_UPPER_CASE, self.SHAPE_TITLE_CASE,
            self.SHAPE_MIXED_CASE, self.SHAPE_LENGTH_NORM, self.SHAPE_LEFT_SAME, self.SHAPE_RIGHT_SAME,
            self.SHAPE_LEFT2_SAME, self.SHAPE_RIGHT2_SAME
        ]
        for opt in options:
            if self.options & opt:
                units += 1

        # Each character-category flag contributes a 30-way one-hot vector.
        if self.options & WordShape.SHAPE_CHAR_CAT_FIRST:
            units += 30
        if self.options & WordShape.SHAPE_CHAR_CAT_LAST:
            units += 30

        return input_shape + (units, )

    def get_config(self):
        """Return the layer config; ``char_embed`` is not stored, so it is not included."""
        config = super().get_config()
        config.update({
            'options': self.options,
            'mean_len': self.mean_len,
            'std_len': self.std_len
        })

        return config
def encode_input_features(inputs,
                          sequence_length,
                          USER_FEATURES,
                          CATEGORICAL_FEATURES_WITH_VOCABULARY,
                          movies,
                          genres,
                          include_user_id=True,
                          include_user_features=True,
                          include_movie_features=True):
    """Encode the model inputs into transformer features and side features.

    Args:
        inputs: Mapping from feature name to input tensor. It is indexed with
            "target_movie_id", "sequence_movie_ids", "sequence_ratings" and
            with every selected user feature name.
        sequence_length: Size of the positional embedding table. Positions
            ``0 .. sequence_length - 2`` are looked up, so the sequence inputs
            presumably hold ``sequence_length - 1`` movies (the target movie
            is passed separately) -- TODO confirm against the caller.
        USER_FEATURES: Names of the user features to embed when
            ``include_user_features`` is true.
        CATEGORICAL_FEATURES_WITH_VOCABULARY: Mapping from feature name to its
            vocabulary. Must contain "movie_id" and every selected user
            feature name.
        movies: Object supporting ``movies[genres].to_numpy()`` (presumably a
            pandas DataFrame). Row ``i`` of the resulting matrix is used as
            the genre vector of movie index ``i``, so rows are assumed to be
            ordered like the movie vocabulary -- TODO confirm.
        genres: Column selector applied to ``movies``.
        include_user_id: Whether to embed the "user_id" feature.
        include_user_features: Whether to embed the ``USER_FEATURES``.
        include_movie_features: Whether to mix the genre vector into each
            movie embedding.

    Returns:
        A tuple ``(encoded_transformer_features, encoded_other_features)``.
        The second item is ``None`` when no user feature was selected.
    """
    encoded_transformer_features = []
    encoded_other_features = []

    other_feature_names = []
    if include_user_id:
        other_feature_names.append("user_id")
    # Fix: this used to test include_movie_features, which left
    # include_user_features without any effect.
    if include_user_features:
        other_feature_names.extend(USER_FEATURES)

    # Encode user features.
    for feature_name in other_feature_names:
        # Convert the string input values into integer indices.
        vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]
        idx = StringLookup(vocabulary=vocabulary,
                           mask_token=None,
                           num_oov_indices=0)(inputs[feature_name])
        # Compute embedding dimensions.
        embedding_dims = int(math.sqrt(len(vocabulary)))
        # Create an embedding layer with the specified dimensions.
        embedding_encoder = layers.Embedding(
            input_dim=len(vocabulary),
            output_dim=embedding_dims,
            name=f"{feature_name}_embedding",
        )
        # Convert the index values to embedding representations.
        encoded_other_features.append(embedding_encoder(idx))

    # Create a single embedding vector for the user features.
    if len(encoded_other_features) > 1:
        encoded_other_features = layers.concatenate(encoded_other_features)
    elif len(encoded_other_features) == 1:
        encoded_other_features = encoded_other_features[0]
    else:
        encoded_other_features = None

    # Create a movie embedding encoder.
    movie_vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY["movie_id"]
    movie_embedding_dims = int(math.sqrt(len(movie_vocabulary)))
    # Create a lookup to convert string values to integer indices.
    # Fix: the lookup was previously given `input_dim` instead of a
    # vocabulary, so it had nothing to map movie ids with.
    movie_index_lookup = StringLookup(
        vocabulary=movie_vocabulary,
        mask_token=None,
        num_oov_indices=0,
        name="movie_index_lookup",
    )
    # Create an embedding layer with the specified dimensions.
    movie_embedding_encoder = layers.Embedding(
        input_dim=len(movie_vocabulary),
        output_dim=movie_embedding_dims,
        name="movie_embedding",
    )
    # Create a frozen vector lookup for movie genres.
    genre_vectors = movies[genres].to_numpy()
    movie_genres_lookup = layers.Embedding(
        input_dim=genre_vectors.shape[0],
        output_dim=genre_vectors.shape[1],
        embeddings_initializer=tf.keras.initializers.Constant(genre_vectors),
        trainable=False,
        name="genres_vector")
    # Create a processing layer for genres.
    movie_embedding_processor = layers.Dense(
        units=movie_embedding_dims,
        activation="relu",
        name="process_movie_embedding_with_genres",
    )

    def encode_movie(movie_id):
        """Embed movie ids, optionally mixing in their genre vectors."""
        # Convert the string input values into integer indices.
        movie_idx = movie_index_lookup(movie_id)
        movie_embedding = movie_embedding_encoder(movie_idx)
        encoded_movie = movie_embedding
        if include_movie_features:
            movie_genres_vector = movie_genres_lookup(movie_idx)
            encoded_movie = movie_embedding_processor(
                layers.concatenate([movie_embedding, movie_genres_vector]))
        return encoded_movie

    # Encode target_movie_id.
    target_movie_id = inputs["target_movie_id"]
    encoded_target_movie = encode_movie(target_movie_id)

    # Encode sequence movie_ids.
    sequence_movie_ids = inputs["sequence_movie_ids"]
    encoded_sequence_movies = encode_movie(sequence_movie_ids)

    # Create positional embedding.
    positional_embedding_encoder = layers.Embedding(
        input_dim=sequence_length,
        output_dim=movie_embedding_dims,
        name="positional_embedding",
    )
    positions = tf.range(start=0, limit=sequence_length - 1, delta=1)
    encoded_positions = positional_embedding_encoder(positions)

    # Retrieve sequence ratings to incorporate them into the encoding
    # of the movie.
    sequence_ratings = tf.expand_dims(inputs["sequence_ratings"], -1)
    # Add the positional encoding to the movie encodings and multiply
    # them by rating.
    encoded_sequence_movies_with_position_and_rating = layers.Multiply()([
        (encoded_sequence_movies + encoded_positions), sequence_ratings
    ])

    # Construct the transformer inputs: one slice per sequence step along
    # axis 1, followed by the encoded target movie.
    for encoded_movie in tf.unstack(
            encoded_sequence_movies_with_position_and_rating, axis=1):
        encoded_transformer_features.append(tf.expand_dims(encoded_movie, 1))
    encoded_transformer_features.append(encoded_target_movie)

    encoded_transformer_features = layers.concatenate(
        encoded_transformer_features, axis=1)

    return encoded_transformer_features, encoded_other_features
class OCR:
    """Read the text fields of an artifact screenshot with a CTC model.

    The constructor builds the recognition network and loads its weights;
    ``detect_info`` is the main entry point.
    """

    # Reference RGB value compared against every sub-attribute crop. A crop
    # in which every pixel deviates from it by more than the tolerance on at
    # least one channel is treated as an empty row (presumably this is the
    # sub-attribute text colour -- TODO confirm against the UI).
    _SUBATTR_COLOR = (73, 83, 102)
    _SUBATTR_COLOR_TOL = 25

    def __init__(self, model_weight='mn_model_weight.h5', scale_ratio=1):
        """Build the model and load its weights.

        Args:
            model_weight: Path of the weight file passed to ``load_weights``.
            scale_ratio: Factor applied to every ``Config`` crop coordinate.
        """
        self.scale_ratio = scale_ratio
        # Every distinct character that appears in the known names and
        # attribute labels, plus the characters used in numeric values.
        self.characters = sorted(
            set("".join(
                sum(ArtsInfo.ArtNames, []) + ArtsInfo.TypeNames +
                list(ArtsInfo.MainAttrNames.values()) +
                list(ArtsInfo.SubAttrNames.values()) +
                list(".,+%0123456789"))))
        # Mapping characters to integers
        self.char_to_num = StringLookup(vocabulary=list(self.characters),
                                        num_oov_indices=0,
                                        mask_token="")

        # Mapping integers back to original characters
        self.num_to_char = StringLookup(
            vocabulary=self.char_to_num.get_vocabulary(),
            oov_token="",
            mask_token="",
            invert=True)

        self.width = 240
        self.height = 16
        self.max_length = 15
        self.build_model(input_shape=(self.width, self.height))
        self.model.load_weights(model_weight)

    def detect_info(self, art_img):
        """Recognise every text field of ``art_img`` and its star count.

        Returns:
            A dict mapping each field name produced by ``extract_art_info``
            to its decoded text, plus a 'star' entry from ``detect_star``.
        """
        info = self.extract_art_info(art_img)
        keys = sorted(info)
        # One batch entry per field: transpose each preprocessed crop and
        # add leading batch and trailing channel axes.
        x = np.concatenate(
            [self.preprocess(info[key]).T[None, :, :, None] for key in keys],
            axis=0)
        y = self.decode(self.model.predict(x))
        result = dict(zip(keys, y))
        result['star'] = self.detect_star(art_img)
        return result

    def _scaled_crop(self, art_img, coords):
        """Crop ``art_img`` to ``coords`` scaled by ``self.scale_ratio``."""
        return art_img.crop([c * self.scale_ratio for c in coords])

    def _is_blank_row(self, region):
        """Return True if no pixel of ``region`` is near the reference colour."""
        # Builtin ``float`` replaces the ``np.float`` alias, which was removed
        # in NumPy 1.24 and raises AttributeError there.
        pixels = np.array(region, float)
        deviation = np.abs(pixels - [[self._SUBATTR_COLOR]]).max(axis=-1)
        return bool(np.all(deviation > self._SUBATTR_COLOR_TOL))

    def extract_art_info(self, art_img):
        """Crop the individual text regions out of ``art_img``.

        Returns:
            A dict with the keys 'name', 'type', 'main_attr_name',
            'main_attr_value' and 'level', followed by 'subattr_1' ..
            'subattr_4' up to (not including) the first blank row.
        """
        info = {
            'name': self._scaled_crop(art_img, Config.name_coords),
            'type': self._scaled_crop(art_img, Config.type_coords),
            'main_attr_name':
            self._scaled_crop(art_img, Config.main_attr_name_coords),
            'main_attr_value':
            self._scaled_crop(art_img, Config.main_attr_value_coords),
            'level': self._scaled_crop(art_img, Config.level_coords),
        }
        subattr_coords = (
            Config.subattr_1_coords,
            Config.subattr_2_coords,
            Config.subattr_3_coords,
            Config.subattr_4_coords,
        )
        # The first blank row ends the list: it and every later row are
        # left out of the result.
        for index, coords in enumerate(subattr_coords, start=1):
            region = self._scaled_crop(art_img, coords)
            if self._is_blank_row(region):
                break
            info[f'subattr_{index}'] = region
        return info

    def detect_star(self, art_img):
        """Estimate the star count from the aspect ratio of the star strip."""
        star = self._scaled_crop(art_img, Config.star_coords)
        cropped_star = self.crop(self.normalize(self.to_gray(star)))
        coef = cropped_star.shape[1] / cropped_star.shape[0]
        # Linear map from width/height ratio to star count; the origin of
        # these constants is not visible in this file.
        coef = coef / 1.30882352 + 0.21568627
        return int(round(coef))

    def to_gray(self, text_img):
        """Convert an image to a 2-D float32 grayscale array.

        Inputs with more than two dimensions are reduced with the weights
        0.299 / 0.587 / 0.114 applied to their first three channels.
        """
        text_img = np.array(text_img)
        if len(text_img.shape) > 2:
            text_img = (
                text_img[..., :3] @ [[[0.299], [0.587], [0.114]]])[:, :, 0]
        return np.array(text_img, np.float32)

    def normalize(self, img, auto_inverse=True):
        """Rescale ``img`` to the range [0, 1], modifying it in place.

        Args:
            img: 2-D float array; it is shifted and scaled in place.
            auto_inverse: If true and the bottom-right pixel is brighter
                than 0.5 after scaling, return ``1 - img`` instead.

        Returns:
            The normalised image. A constant image yields all zeros rather
            than NaNs.
        """
        img -= img.min()
        peak = img.max()
        # Guard against 0/0 on a constant image.
        if peak > 0:
            img /= peak
        if auto_inverse and img[-1, -1] > 0.5:
            img = 1 - img
        return img

    def crop(self, img, tol=0.7):
        """Trim ``img`` to the bounding box of the pixels greater than ``tol``.

        If no pixel exceeds ``tol`` the whole image is returned.
        """
        mask = img > tol
        m, n = img.shape
        mask0, mask1 = mask.any(0), mask.any(1)
        col_start, col_end = mask0.argmax(), n - mask0[::-1].argmax()
        row_start, row_end = mask1.argmax(), m - mask1[::-1].argmax()
        return img[row_start:row_end, col_start:col_end]

    def resize_to_height(self, img):
        """Resize ``img`` to ``self.height`` rows, keeping its aspect ratio.

        ``img`` is scaled by 255 and cast to uint8 for the resize; the
        result is divided by 255 again.
        """
        height = self.height
        new_width = int(img.shape[1] * height / img.shape[0])
        resized = Image.fromarray(np.uint8(img * 255)).resize(
            (new_width, height),
            Image.BILINEAR,
        )
        return np.array(resized) / 255

    def pad_to_width(self, img):
        """Return ``img`` truncated or zero-padded on the right to ``self.width``."""
        width = self.width
        if img.shape[1] >= width:
            return img[:, :width]
        return np.pad(img, [[0, 0], [0, width - img.shape[1]]],
                      mode="constant",
                      constant_values=0)

    def preprocess(self, text_img):
        """Turn a cropped text image into a ``height`` x ``width`` array."""
        result = self.to_gray(text_img)
        result = self.normalize(result, True)
        result = self.crop(result)
        # Re-normalise after cropping, this time without inversion.
        result = self.normalize(result, False)
        result = self.resize_to_height(result)
        result = self.pad_to_width(result)
        return result

    def decode(self, pred):
        """Greedy-CTC-decode model output into a list of strings.

        Args:
            pred: Model output; axis 0 is used as the batch size and axis 1
                as the input length of every sample.
        """
        input_len = np.ones(pred.shape[0]) * pred.shape[1]
        # Use greedy search. For complex tasks, you can use beam search
        results = ctc_decode(pred, input_length=input_len,
                             greedy=True)[0][0][:, :self.max_length]
        # Iterate over the results and get back the text
        output_text = []
        for res in results:
            res = self.num_to_char(res)
            res = reduce_join(res)
            res = res.numpy().decode("utf-8")
            output_text.append(res)
        return output_text

    def build_model(self, input_shape):
        """Build the recognition network and store it in ``self.model``.

        Args:
            input_shape: ``(width, height)`` of the single-channel input.
        """
        input_img = Input(shape=(input_shape[0], input_shape[1], 1),
                          name="image",
                          dtype="float32")
        mobilenet = MobileNetV3_Small((input_shape[0], input_shape[1], 1),
                                      0,
                                      alpha=1.0,
                                      include_top=False).build()
        x = mobilenet(input_img)
        # Assumes the backbone downsamples both axes by 8 and emits 576
        # channels -- TODO confirm against MobileNetV3_Small.
        new_shape = ((input_shape[0] // 8), (input_shape[1] // 8) * 576)
        x = Reshape(target_shape=new_shape, name="reshape")(x)
        x = Dense(64, activation="relu", name="dense1")(x)
        x = Dropout(0.2)(x)

        # RNNs
        x = Bidirectional(LSTM(128, return_sequences=True, dropout=0.25))(x)
        x = Bidirectional(LSTM(64, return_sequences=True, dropout=0.25))(x)

        # Output layer
        output = Dense(len(self.characters) + 2,
                       activation="softmax",
                       name="dense2")(x)

        # Define the model
        self.model = Model(inputs=[input_img],
                           outputs=output,
                           name="ocr_model_v1")
def main():
    """Train and evaluate a neural decision tree and a neural decision forest.

    Downloads the UCI Adult train/test splits, stores them as local CSV
    files, builds the feature metadata, runs two experiments through
    ``run_experiment`` and then exits the process with status 0.
    """
    # Local import: `sys.exit` replaces the site-module `exit()` helper,
    # which is meant for the interactive shell and is not always installed.
    import sys

    # Prepare the data.
    CSV_HEADER = [
        "age",
        "workclass",
        "fnlwgt",
        "education",
        "education_num",
        "marital_status",
        "occupation",
        "relationship",
        "race",
        "gender",
        "capital_gain",
        "capital_loss",
        "hours_per_week",
        "native_country",
        "income_bracket",
    ]

    train_data_url = (
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
    )
    train_data = pd.read_csv(train_data_url, header=None, names=CSV_HEADER)

    test_data_url = (
        "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
    )
    test_data = pd.read_csv(test_data_url, header=None, names=CSV_HEADER)

    print(f"Train dataset shape: {train_data.shape}")
    print(f"Test dataset shape: {test_data.shape}")

    # Remove the first record (because it is not a valid example) and a
    # trailing "dot" in the class labels.
    test_data = test_data[1:]
    test_data.income_bracket = test_data.income_bracket.apply(
        lambda value: value.replace(".", ""))

    # Store the training and test data splits locally as CSV files.
    train_data_file = "train_data.csv"
    test_data_file = "test_data.csv"
    train_data.to_csv(train_data_file, index=False, header=False)
    test_data.to_csv(test_data_file, index=False, header=False)

    # Define dataset metadata.
    # Here, define the metadata of the dataset that will be useful for
    # reading and parsing and encoding input features.

    # A list of numerical feature names.
    NUMERICAL_FEATURE_NAMES = [
        "age",
        "education_num",
        "capital_gain",
        "capital_loss",
        "hours_per_week",
    ]
    # A dictionary of the categorical features and their vocabulary, taken
    # from the sorted distinct values of each training column. The key order
    # is kept because it defines CATEGORICAL_FEATURE_NAMES below.
    categorical_columns = (
        "workclass",
        "education",
        "marital_status",
        "occupation",
        "relationship",
        "race",
        "gender",
        "native_country",
    )
    CATEGORICAL_FEATURES_WITH_VOCABULARY = {
        column: sorted(train_data[column].unique())
        for column in categorical_columns
    }
    # A list of the columns to ignore from the dataset.
    IGNORE_COLUMN_NAMES = ["fnlwgt"]
    # A list of the categorical feature names.
    CATEGORICAL_FEATURE_NAMES = list(
        CATEGORICAL_FEATURES_WITH_VOCABULARY.keys())
    # A list of all the input features.
    FEATURE_NAMES = NUMERICAL_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES
    # A list of column default values for each feature: [0.0] for the
    # numerical and ignored columns, ["NA"] for everything else.
    float_columns = set(NUMERICAL_FEATURE_NAMES + IGNORE_COLUMN_NAMES)
    COLUMN_DEFAULTS = [[0.0] if feature_name in float_columns else ["NA"]
                       for feature_name in CSV_HEADER]
    # The name of the target feature.
    TARGET_FEATURE_NAME = "income_bracket"
    # A list of the labels of the target features.
    TARGET_LABELS = [" <=50K", " >50K"]

    # Lookup that maps the target labels to integer indices.
    target_label_lookup = StringLookup(vocabulary=TARGET_LABELS,
                                       mask_token=None,
                                       num_oov_indices=0)

    # Set up the code that will train and evaluate the model.
    learning_rate = 0.01
    batch_size = 265
    num_epochs = 10

    # Experiment 1: Train a decision tree model.
    # In this experiment, train a single neural decision tree model
    # that uses all input features.
    depth = 10
    used_features_rate = 1.0
    num_classes = len(TARGET_LABELS)

    tree_model = create_tree_model(FEATURE_NAMES, NUMERICAL_FEATURE_NAMES,
                                   CATEGORICAL_FEATURE_NAMES,
                                   CATEGORICAL_FEATURES_WITH_VOCABULARY, depth,
                                   used_features_rate, num_classes)
    run_experiment(tree_model, learning_rate, train_data_file, test_data_file,
                   CSV_HEADER, COLUMN_DEFAULTS, TARGET_FEATURE_NAME,
                   target_label_lookup, batch_size, num_epochs)

    # Experiment 2: Train a forest model.
    # In this experiment, train a neural decision forest with num_trees
    # where each tree uses randomly selected 50% of the input features.
    # Can control the number of features to be used in each tree by
    # setting the used_features_rate variable. In addition, set the
    # depth to 5 instead of 10 compared to the previous experiment.
    num_trees = 25
    depth = 5
    used_features_rate = 0.5

    forest_model = create_forest_model(FEATURE_NAMES, NUMERICAL_FEATURE_NAMES,
                                       CATEGORICAL_FEATURE_NAMES,
                                       CATEGORICAL_FEATURES_WITH_VOCABULARY,
                                       num_trees, depth, used_features_rate,
                                       num_classes)

    run_experiment(forest_model, learning_rate, train_data_file,
                   test_data_file, CSV_HEADER, COLUMN_DEFAULTS,
                   TARGET_FEATURE_NAME, target_label_lookup, batch_size,
                   num_epochs)

    # Exit the program.
    sys.exit(0)
# Initialise the parent class with a new StringLookup layer constructed with
# no arguments (no vocabulary is supplied here). The parent class is not
# visible in this chunk, so how it uses the layer cannot be stated from here.
def __init__(self): super().__init__(StringLookup())