def din_logit_fn(features, mode, params):
    """Build the DIN network and return its logits.

    For each id family (product, brand, seller, cate, cate1) the behavior
    feature and the target-id feature are converted to strings, hashed into
    ``params["vocab_size"][family]`` buckets and handed to
    ``din_attention_layer``. The attention outputs are concatenated with the
    dense input layer and passed through the hidden layers.

    Args:
        features: mapping from feature name to tensor.
        mode: a ``tf.estimator.ModeKeys`` value; dropout is only applied in
            TRAIN mode.
        params: mapping providing ``feature_columns``, ``vocab_size``,
            ``hidden_units`` and, optionally, ``dropout_rate``.

    Returns:
        The output of the final dense layer (``units=1``).
    """

    def _hashed(feature_name, bucket_count):
        # Ids are stringified first so that they can be hashed.
        return tf.string_to_hash_bucket_fast(
            tf.as_string(features[feature_name]), bucket_count)

    common = tf.feature_column.input_layer(features, params['feature_columns'])

    # (vocab key / attention name, behavior feature, target feature).
    # The order is the order in which the attention layers are built.
    families = (
        ("product", "behaviorPids", "productId"),
        ("brand", "behaviorBids", "brandId"),
        ("seller", "behaviorSids", "sellerId"),
        ("cate", "behaviorCids", "cateId"),
        ("cate1", "behaviorC1ids", "cate1Id"),
    )
    sequence_emb = {}
    target_emb = {}
    for family, behavior_key, target_key in families:
        bucket_count = params["vocab_size"][family]
        behavior_ids = _hashed(behavior_key, bucket_count)
        target_ids = _hashed(target_key, bucket_count)
        sequence_emb[family], target_emb[family] = din_attention_layer(
            behavior_ids, target_ids, family)

    # Concatenation order differs from the build order above.
    concat_order = ("product", "cate1", "cate", "brand", "seller")
    net = tf.concat(
        [common]
        + [sequence_emb[family] for family in concat_order]
        + [target_emb[family] for family in concat_order],
        axis=1)

    is_training = mode == tf.estimator.ModeKeys.TRAIN
    for units in params['hidden_units']:
        net = tf.layers.dense(net, units=units, activation=tf.nn.relu)
        if 'dropout_rate' in params and params['dropout_rate'] > 0.0:
            net = tf.layers.dropout(net, params['dropout_rate'],
                                    training=is_training)

    return tf.layers.dense(net, units=1)
def transform(self, x):
    """Hash every column, and the configured column pairs, of ``x``.

    :param x: 2-D string tensor; column ``i`` is read as ``x[:, i]`` and the
        static size of the second dimension must be known.
    :return: dict of id-tensor lists, E.g
        {sparse=[tensor(123), tensor(342), tensor(532)],
         inter_sparse=[tensor(12), tensor(32), tensor(52)]}.
        ``sparse`` holds one tensor per column, ``inter_sparse`` one tensor per
        entry of ``param_dict["feature_interaction"]`` (strings of the form
        ``"<i>x<j>"``).
    """
    hash_size = self.param_dict["hash_size"]
    feature_interaction = self.param_dict["feature_interaction"]

    with self.graph.as_default():
        with tf.name_scope("sparse_lookup"):
            # `range` instead of `xrange`: the latter does not exist on
            # Python 3, while `range` iterates identically on both versions.
            # The "f<i>_" prefix keeps equal values in different columns from
            # hashing to the same bucket.
            x_sparse = [
                tf.string_to_hash_bucket_fast(
                    input="f{}_".format(i) + x[:, i],
                    num_buckets=hash_size,
                    name="sparse_feature_{}".format(i))
                for i in range(x.shape[1].value)
            ]

        x_interact_sparse = []
        with tf.name_scope("inter_sparse_lookup"):
            for ixj in feature_interaction:
                i, j = ixj.split('x')
                x_ixj = x[:, int(i)] + "x" + x[:, int(j)]
                x_interact_sparse.append(
                    tf.string_to_hash_bucket_fast(
                        input="f{}x{}_".format(i, j) + x_ixj,
                        num_buckets=hash_size,
                        name="sparse_feature_interact_{}x{}".format(i, j)))

    return {"sparse": x_sparse, "inter_sparse": x_interact_sparse}
def build_categorial_features(self):
    """Create one-hot encodings for the client type and the gender.

    Each string feature is hashed into ``<FEATURE>_CNT`` buckets and the
    bucket id is one-hot encoded with the same depth. The results are stored
    on ``self.client_type_one_hot`` and ``self.gender_one_hot``.
    """
    client_type_idx = tf.string_to_hash_bucket_fast(
        self.client_type, self.CLIENT_TYPE_CNT)
    self.client_type_one_hot = tf.one_hot(client_type_idx,
                                          self.CLIENT_TYPE_CNT)

    # Fix: the gender encoding used to be derived from `self.client_type`
    # (copy-paste slip), so it never carried gender information.
    # NOTE(review): assumes `self.gender` is a string tensor like
    # `self.client_type` - confirm against where the inputs are defined.
    gender_idx = tf.string_to_hash_bucket_fast(self.gender, self.GENDER_CNT)
    self.gender_one_hot = tf.one_hot(gender_idx, self.GENDER_CNT)
def build_model(self):
    """Build the user tower, the item tower and the user/item logits.

    The user tower combines the summed embedding of the clicked items with
    one-hot gender and client type and runs three tanh dense layers. The item
    tower looks up item, category and tag embeddings of the target items and
    runs one tanh dense layer. The logits are the matrix product of the user
    vector with the transposed item outputs and are also registered in
    ``self.saved_model_outputs["logits"]``.
    """

    def _bucketize(ids, bucket_count):
        # Ids are stringified so that they can be hashed.
        return tf.string_to_hash_bucket_fast(tf.as_string(ids), bucket_count)

    def _tanh_dense(inputs, units, layer_name):
        return tf.layers.dense(
            inputs, units, activation=tf.nn.tanh, name=layer_name,
            kernel_initializer=tf.glorot_normal_initializer())

    clicked_item_idx = _bucketize(self.user_click_item_list, self.ITEM_MOD)

    # User embedding layer.
    with tf.name_scope("user_tower"):
        with tf.name_scope('user_embedding'):
            clicked_items_embed = self.get_seq_embedding(
                self.item_embedding, clicked_item_idx,
                self.user_click_item_list_len, self.item_embedding_size,
                'sum')
            gender_one_hot = tf.one_hot(self.gender, self.GENDER_CNT)
            client_type_one_hot = tf.one_hot(self.client_type,
                                             self.CLIENT_TYPE_CNT)
            user_features = tf.concat(
                [clicked_items_embed, gender_one_hot, client_type_one_hot],
                axis=-1)

        with tf.name_scope('layers'):
            user_hidden = _tanh_dense(user_features, 1024, 'user_first')
            user_hidden = _tanh_dense(user_hidden, 512, 'user_second')
            user_hidden = _tanh_dense(user_hidden, self.item_embedding_size,
                                      'user_final')

        self.user_embedding_final = user_hidden

    with tf.name_scope("item_tower"):
        item_idx = _bucketize(self.target_item_list, self.ITEM_MOD)
        cate_idx = _bucketize(self.target_cate_list, self.CATE_MOD)
        tag_idx = _bucketize(self.target_tag_list, self.TAG_MOD)

        item_features = tf.concat(
            [
                tf.nn.embedding_lookup(self.item_embedding, item_idx),
                tf.nn.embedding_lookup(self.cate_embedding, cate_idx),
                tf.nn.embedding_lookup(self.tag_embedding, tag_idx),
            ],
            axis=-1)

        with tf.name_scope('item_layers'):
            self.item_embed_output = _tanh_dense(
                item_features, self.item_embedding_size, 'item_first')

        # The item outputs are split in two along axis 1; the first part is
        # stored as the positive embedding, the second as the negative one.
        positive_part, negative_part = tf.split(self.item_embed_output, 2, 1)
        self.pos_embed_final = tf.squeeze(positive_part)
        self.neg_embed_final = tf.squeeze(negative_part)

    # if self.is_train:
    user_row = tf.expand_dims(self.user_embedding_final, 1)
    items_transposed = tf.transpose(self.item_embed_output, perm=[0, 2, 1])
    self.logits = tf.squeeze(tf.matmul(user_row, items_transposed), axis=1)

    self.saved_model_outputs["logits"] = (
        tf.saved_model.utils.build_tensor_info(self.logits))
def din_model_fn(features, labels, mode, params):
    """Estimator ``model_fn`` combining deep, cross and attention branches.

    Args:
        features: mapping of feature tensors; ``keyword_attention``,
            ``creative_id``, ``user_id`` and ``label`` are read directly, the
            rest through ``params['feature_columns']``.
        labels: labels forwarded to the head.
        mode: a ``tf.estimator.ModeKeys`` value.
        params: mapping providing ``feature_columns`` and ``learning_rate``;
            it is also passed to ``build_deep_layers`` / ``build_cross_layers``.

    Returns:
        A ``tf.estimator.EstimatorSpec``. In PREDICT mode it carries the
        sigmoid probabilities together with ``user_id`` and ``label``;
        otherwise it is produced by the binary classification head.
    """
    keyword_buckets = 500000
    creative_buckets = 200000
    embedding_dim = 20

    net = tf.feature_column.input_layer(features, params['feature_columns'])

    keyword_ids = tf.string_to_hash_bucket_fast(
        features["keyword_attention"], keyword_buckets)
    keyword_table = tf.get_variable(
        name="attention_keyword_embeddings", dtype=tf.float32,
        shape=[keyword_buckets, embedding_dim])
    # shape(batch_size, len, embedding_size)
    keyword_emb = tf.nn.embedding_lookup(keyword_table, keyword_ids)

    creative_ids = tf.string_to_hash_bucket_fast(
        tf.as_string(features["creative_id"]), creative_buckets)
    creative_table = tf.get_variable(
        name="attention_creativeid_embeddings", dtype=tf.float32,
        shape=[creative_buckets, embedding_dim])
    # shape(batch_size, 1, embedding_size)
    creative_emb = tf.nn.embedding_lookup(creative_table, creative_ids)

    # (batchsize,embedding_size)
    attention_out = attention_layer(creative_emb, keyword_emb)

    deep_out = build_deep_layers(net, params)
    cross_out = build_cross_layers(net, params)
    last_layer = tf.concat([deep_out, cross_out, attention_out], 1)

    # head = tf.contrib.estimator.binary_classification_head(loss_reduction=losses.Reduction.SUM)
    head = head_lib._binary_logistic_or_multi_class_head(  # pylint: disable=protected-access
        n_classes=2,
        weight_column=None,
        label_vocabulary=None,
        loss_reduction=losses.Reduction.SUM)

    logits = tf.layers.dense(
        last_layer, units=head.logits_dimension,
        kernel_initializer=tf.glorot_uniform_initializer())
    optimizer = tf.train.AdagradOptimizer(
        learning_rate=params['learning_rate'])
    preds = tf.sigmoid(logits)
    user_id = features['user_id']
    label = features['label']

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'probabilities': preds,
            'user_id': user_id,
            'label': label,
        }
        export_outputs = {
            'regression': tf.estimator.export.RegressionOutput(
                predictions['probabilities']),
        }
        return tf.estimator.EstimatorSpec(
            mode, predictions=predictions, export_outputs=export_outputs)

    def _train_op(loss):
        return optimizer.minimize(loss,
                                  global_step=tf.train.get_global_step())

    return head.create_estimator_spec(
        features=features, mode=mode, labels=labels, logits=logits,
        train_op_fn=_train_op)
def build_categorial_features(self):
    """Derive index, one-hot and embedding tensors for categorical inputs.

    String features are handled in one of two ways:

    * hashed into ``<FEATURE>_CNT`` buckets (class id, item id, device brand,
      device OS, feed type), or
    * parsed as int32 numbers (gender, age, consume level, client type and
      the ``dev_*`` type/carrier/net features). These overwrite the raw
      attribute with the parsed tensor.

    Every feature except the item id gets a ``*_one_hot`` attribute; the item
    id is looked up in ``self.item_embedding`` instead.
    """
    to_bucket = tf.string_to_hash_bucket_fast

    def _as_int(raw):
        return tf.string_to_number(raw, tf.int32)

    self.class_id_idx = to_bucket(self.class_id, self.CATE_CNT)
    self.class_id_one_hot = tf.one_hot(self.class_id_idx, self.CATE_CNT)

    self.gender = _as_int(self.gender)
    self.gender_one_hot = tf.one_hot(self.gender, self.GENDER_CNT)

    self.age = _as_int(self.age)
    self.age_one_hot = tf.one_hot(self.age, self.AGE_CNT)

    self.consume_level = _as_int(self.consume_level)
    self.consume_level_one_hot = tf.one_hot(self.consume_level,
                                            self.CONSUME_LEVEL_CNT)

    self.client_type = _as_int(self.client_type)
    self.client_type_one_hot = tf.one_hot(self.client_type,
                                          self.CLIENT_TYPE_CNT)

    self.item_id_hash = to_bucket(self.item_id, self.ITEM_CNT)
    self.item_id_embed = tf.nn.embedding_lookup(self.item_embedding,
                                                self.item_id_hash)

    self.dev_brand_type = _as_int(self.dev_brand_type)
    self.dev_brand_type_one_hot = tf.one_hot(self.dev_brand_type,
                                             self.DEV_BRAND_TYPE_CNT)

    self.dev_type = _as_int(self.dev_type)
    self.dev_type_one_hot = tf.one_hot(self.dev_type, self.DEV_TYPE_CNT)

    self.dev_carrier = _as_int(self.dev_carrier)
    self.dev_carrier_one_hot = tf.one_hot(self.dev_carrier,
                                          self.DEV_CARRIER_CNT)

    self.dev_net = _as_int(self.dev_net)
    self.dev_net_one_hot = tf.one_hot(self.dev_net, self.DEV_NET_CNT)

    self.dev_brand_idx = to_bucket(self.dev_brand, self.DEV_BRAND_CNT)
    self.dev_brand_one_hot = tf.one_hot(self.dev_brand_idx,
                                        self.DEV_BRAND_CNT)

    # Unlike the other hashed features, the OS bucket id replaces the raw
    # `self.dev_os` attribute.
    self.dev_os = to_bucket(self.dev_os, self.DEV_OS_CNT)
    self.dev_os_one_hot = tf.one_hot(self.dev_os, self.DEV_OS_CNT)

    self.feed_type_idx = to_bucket(self.feed_type, self.FEED_TYPE_CNT)
    self.feed_type_one_hot = tf.one_hot(self.feed_type_idx,
                                        self.FEED_TYPE_CNT)
def hash_strings(strings, hash_buckets, key=None, name=None):
    """Hash strings into buckets.

    Args:
      strings: a `Tensor` or `SparseTensor` of dtype `tf.string`.
      hash_buckets: the number of hash buckets.
      key: optional. An array of two Python `uint64`. If passed, output will be
        a deterministic function of `strings` and `key`. Note that hashing will
        be slower if this value is specified.
      name: (Optional) A name for this operation. For a `SparseTensor` input it
        names the hashing op applied to the values.

    Returns:
      A `Tensor` or `SparseTensor` of dtype `tf.int64` with the same shape as
      the input `strings`.

    Raises:
      TypeError: if `strings` is not a `Tensor` or `SparseTensor` of dtype
        `tf.string`.
    """
    if (not isinstance(strings, (tf.Tensor, tf.SparseTensor))
            or strings.dtype != tf.string):
        # Objects that are not tensors may not have a `dtype` attribute at
        # all; fall back to their type so that the documented TypeError is
        # raised instead of an AttributeError.
        raise TypeError(
            'Input to hash_strings must be a Tensor or SparseTensor of dtype '
            'string; got {}'.format(
                getattr(strings, 'dtype', type(strings))))

    if isinstance(strings, tf.SparseTensor):
        # Only the values are hashed; indices and shape are carried over.
        # `name` is forwarded so that a caller-supplied name is not dropped.
        return tf.SparseTensor(
            indices=strings.indices,
            values=hash_strings(strings.values, hash_buckets, key, name),
            dense_shape=strings.dense_shape)

    if name is None:
        name = 'hash_strings'
    if key is None:
        return tf.string_to_hash_bucket_fast(strings, hash_buckets, name=name)
    return tf.string_to_hash_bucket_strong(strings, hash_buckets, key,
                                           name=name)
def call(self, x, mask=None, **kwargs):
    """Hash ``x`` into bucket ids, optionally reserving id 0 for zeros.

    Non-string input is converted with ``tf.as_string`` first.

    Example with ``num_buckets = 8``:

    * ``mask_zero`` set: values are hashed into 7 buckets (0-6) and shifted by
      one to 1-7; inputs equal to ``"0"`` or ``"0.0"`` end up as 0.
    * ``mask_zero`` not set: values are hashed into 8 buckets (0-7).

    Args:
        x: input tensor.
        mask: ignored on input; only reused as a local name.
        **kwargs: ignored.

    Returns:
        The int64 tensor of bucket ids.
    """
    if x.dtype != tf.string:
        x = tf.as_string(x)

    bucket_count = self.num_buckets - 1 if self.mask_zero else self.num_buckets

    # Only the symbol lookup is guarded: the old top-level alias is missing in
    # newer TensorFlow releases, which shows up as an AttributeError. A bare
    # `except:` would also have hidden real errors raised by the op itself.
    try:
        to_hash_bucket = tf.string_to_hash_bucket_fast
    except AttributeError:
        to_hash_bucket = tf.strings.to_hash_bucket_fast

    hash_x = to_hash_bucket(x, bucket_count, name=None)  # weak hash

    if self.mask_zero:
        # The boolean comparisons are cast to int64, so False becomes 0.
        mask_1 = tf.cast(tf.not_equal(x, "0"), 'int64')
        mask_2 = tf.cast(tf.not_equal(x, "0.0"), 'int64')
        # The mask is 0 where x is "0" or "0.0".
        mask = mask_1 * mask_2
        # After the +1 shift every id is >= 1; multiplying by the mask sends
        # the zero inputs back to 0.
        hash_x = (hash_x + 1) * mask
    return hash_x
def _input_fn():
    """Read TFRecords and return shuffled batches of hashed words and labels.

    Uses ``filenames``, ``num_epochs``, ``sentence_length``, ``vocab_size``
    and ``batch_size`` from the enclosing scope.

    Returns:
        A ``(sentences, subreddits)`` tuple as produced by
        ``tf.train.shuffle_batch``: the hashed word ids and the ``subreddit``
        feature.
    """
    with tf.name_scope('input'):
        filename_queue = tf.train.string_input_producer(
            filenames, num_epochs=num_epochs)
        reader = tf.TFRecordReader()
        # Fix: `read()` yields a single serialized record, which is what
        # `parse_single_example` consumes. The previous code called
        # `read_up_to()` without its required `num_records` argument and then
        # parsed an undefined name (`serialized_examples`).
        _, serialized_example = reader.read(filename_queue)
        features = tf.parse_single_example(
            serialized_example,
            {
                'words': tf.VarLenFeature(tf.string),
                'subreddit': tf.FixedLenFeature([1], tf.int64),
            })
        # Positions without a word are filled with 'UNK'.
        padded_words = tf.sparse_to_dense(
            features['words'].indices,
            [sentence_length],
            features['words'].values,
            default_value='UNK')
        word_indices = tf.string_to_hash_bucket_fast(padded_words, vocab_size)
        sentences, subreddits = tf.train.shuffle_batch(
            [word_indices, features['subreddit']],
            batch_size,
            capacity=1000 + 3 * batch_size,
            min_after_dequeue=1000,
            enqueue_many=False)
    return sentences, subreddits
def _get_features_dict(input_dict):
    """Extracts features dict from input dict.

    Copies a fixed set of entries from ``input_dict``, adds the source id
    hashed into ``HASH_BINS`` buckets (cast to int32) under ``HASH_KEY`` and,
    when present, the original image.
    """
    input_fields = fields.InputDataFields
    source_id_hash = tf.string_to_hash_bucket_fast(
        input_dict[input_fields.source_id], HASH_BINS)

    copied_keys = (
        'ref_sec',
        'query',
        'query_box',
        'ref',
        'query_shape',
        'query_sec',
        input_fields.true_image_shape,
    )
    features = {key: input_dict[key] for key in copied_keys}
    features[HASH_KEY] = tf.cast(source_id_hash, tf.int32)

    if input_fields.original_image in input_dict:
        features[input_fields.original_image] = (
            input_dict[input_fields.original_image])
    return features
def testStringToOneHashBucketFast(self):
    """With a single bucket every input string must land in bucket 0."""
    with self.test_session():
        strings = tf.placeholder(tf.string)
        buckets = tf.string_to_hash_bucket_fast(strings, 1)
        observed = buckets.eval(feed_dict={strings: ['a', 'b', 'c']})
        self.assertAllEqual([0, 0, 0], observed)
def sampled_softmax_loss(src_emb, pos_ids, neg_num, output_emb_table,
                         output_emb_bias, node_size, s2h=True):
    """Sampled softmax loss.

    Args:
      src_emb: positive src embedding with shape [batch_size, dim]
      pos_ids: positive ids, used as labels after being reshaped to [-1, 1].
      neg_num: number of sampled classes (``num_sampled``).
      output_emb_table: weights passed to ``tf.nn.sampled_softmax_loss``.
      output_emb_bias: biases passed to ``tf.nn.sampled_softmax_loss``.
      node_size: total node size; also the number of hash buckets and classes.
      s2h: set True if need string to hash.

    Returns:
      A three-element list ``[mean loss, None, None]``.
    """
    labels = pos_ids
    if s2h:
        # Ids are stringified and hashed into `node_size` buckets.
        labels = tf.string_to_hash_bucket_fast(
            tf.as_string(labels), node_size,
            name='softmax_loss_to_hash_bucket_oper')

    per_example_loss = tf.nn.sampled_softmax_loss(
        weights=output_emb_table,
        biases=output_emb_bias,
        labels=tf.reshape(labels, [-1, 1]),
        inputs=src_emb,
        num_sampled=neg_num,
        num_classes=node_size,
        partition_strategy='mod',
        remove_accidental_hits=True)
    mean_loss = tf.reduce_mean(per_example_loss)
    return [mean_loss, None, None]
def _instruction(self, instruction):
    """Encode instruction strings with a hashed-word embedding and an LSTM.

    The strings are split into words, each word is hashed into a bucket and
    embedded, and the sequence is fed through an LSTM. The output at the last
    valid position of every sequence is returned.
    """
    # Hash buckets are used in place of a vocabulary; there is a small risk
    # of collisions.
    num_hash_buckets = 1000
    # Embedding size 20 seems to be enough.
    embedding_size = 20

    # Split the strings; missing positions are filled with ''.
    words = tf.sparse_tensor_to_dense(tf.string_split(instruction),
                                      default_value='')
    is_word = tf.not_equal(words, '')
    length = tf.reduce_sum(tf.to_int32(is_word), axis=1)

    # To int64 hash buckets.
    word_ids = tf.string_to_hash_bucket_fast(words, num_hash_buckets)

    # Embed the instruction.
    embedded = snt.Embed(num_hash_buckets, embedding_size)(word_ids)

    # Pad to make sure there is at least one output.
    extra_steps = tf.to_int32(tf.equal(tf.shape(embedded)[1], 0))
    embedded = tf.pad(embedded, [[0, 0], [0, extra_steps], [0, 0]])

    lstm = tf.contrib.rnn.LSTMBlockCell(64, name='language_lstm')
    outputs, _ = tf.nn.dynamic_rnn(lstm, embedded, length, dtype=tf.float32)

    # Return last output: reversing each sequence over its own length moves
    # the last valid step to position 0.
    reversed_outputs = tf.reverse_sequence(outputs, length, seq_axis=1)
    return reversed_outputs[:, 0]
def text_module_fn():
    """Define a module that maps strings to rows of a [100, 10] variable.

    Each input string is hashed into as many buckets as ``weights`` has rows
    and the matching row is gathered. The string placeholder and the gathered
    rows are registered as the module signature.
    """
    embedding_table = tf.get_variable("weights", dtype=tf.float32,
                                      shape=[100, 10])
    # initializer=tf.random_uniform_initializer())
    sentences = tf.placeholder(tf.string, shape=[None])
    row_count = embedding_table.get_shape()[0]
    row_ids = tf.string_to_hash_bucket_fast(sentences, row_count)
    hub.add_signature(inputs=sentences,
                      outputs=tf.gather(embedding_table, row_ids))
def _get_features_dict(input_dict):
    """Extracts features dict from input dict.

    The source id is first passed through
    ``_replace_empty_string_with_random_number`` and then hashed into
    ``HASH_BINS`` buckets; the int32 result is stored under ``HASH_KEY``.
    The original image and the additional image channels are copied only when
    they are present in ``input_dict``.
    """
    input_fields = fields.InputDataFields

    source_id = _replace_empty_string_with_random_number(
        input_dict[input_fields.source_id])
    source_id_hash = tf.string_to_hash_bucket_fast(source_id, HASH_BINS)

    features = {
        input_fields.image: input_dict[input_fields.image],
        HASH_KEY: tf.cast(source_id_hash, tf.int32),
        input_fields.true_image_shape:
            input_dict[input_fields.true_image_shape],
        input_fields.original_image_spatial_shape:
            input_dict[input_fields.original_image_spatial_shape],
    }

    optional_keys = (
        input_fields.original_image,
        input_fields.image_additional_channels,
    )
    for key in optional_keys:
        if key in input_dict:
            features[key] = input_dict[key]
    return features
def _graph_fn_apply(self, text_inputs):
    """
    Splits the text inputs into tokens and hashes every token into a bucket.

    Args:
        text_inputs (SingleDataOp): The Text input to generate a hash bucket for.

    Returns:
        tuple:
            - SingleDataOp: The hash-bucket ids (cast to `self.dtype` unless
              that is "int64") that can be used as input to embedding-lookups.
            - SingleDataOp: The number of non-empty tokens per item of
              `text_inputs`.

        Nothing is returned for backends other than "tf".
    """
    if get_backend() != "tf":
        return None

    # Split the input strings and densify the result; missing positions
    # become "".
    tokens = tf.sparse_tensor_to_dense(
        sp_input=tf.string_split(source=text_inputs, delimiter=self.delimiter),
        default_value=""
    )
    non_empty = tf.to_int32(x=tf.not_equal(x=tokens, y=""))
    length = tf.reduce_sum(input_tensor=non_empty, axis=-1)

    use_fast_hash = self.hash_function == "fast"
    if use_fast_hash:
        hash_bucket = tf.string_to_hash_bucket_fast(
            input=tokens, num_buckets=self.num_hash_buckets
        )
    else:
        hash_bucket = tf.string_to_hash_bucket_strong(
            input=tokens, num_buckets=self.num_hash_buckets, key=self.hash_keys
        )

    # Int64 is tf's default for `string_to_hash_bucket` operation: Can leave as is.
    if self.dtype != "int64":
        hash_bucket = tf.cast(x=hash_bucket, dtype=dtype_(self.dtype))

    # Hash-bucket output is always batch-major.
    hash_bucket._batch_rank = 0
    hash_bucket._time_rank = 1

    return hash_bucket, length
def _input_fn():
    """Read TFRecords and return shuffled batches of hashed words and labels.

    Uses ``filenames``, ``num_epochs``, ``sentence_length``, ``vocab_size``
    and ``batch_size`` from the enclosing scope.

    Returns:
        A ``(sentences, subreddits)`` tuple as produced by
        ``tf.train.shuffle_batch``: the hashed word ids and the ``subreddit``
        feature.
    """
    with tf.name_scope('input'):
        filename_queue = tf.train.string_input_producer(
            filenames, num_epochs=num_epochs)
        reader = tf.TFRecordReader()
        # Fix: `read()` yields a single serialized record, which is what
        # `parse_single_example` consumes. The previous code called
        # `read_up_to()` without its required `num_records` argument and then
        # parsed an undefined name (`serialized_examples`).
        _, serialized_example = reader.read(filename_queue)
        features = tf.parse_single_example(
            serialized_example,
            {
                'words': tf.VarLenFeature(tf.string),
                'subreddit': tf.FixedLenFeature([1], tf.int64)
            }
        )
        # Positions without a word are filled with 'UNK'.
        padded_words = tf.sparse_to_dense(
            features['words'].indices,
            [sentence_length],
            features['words'].values,
            default_value='UNK'
        )
        word_indices = tf.string_to_hash_bucket_fast(
            padded_words, vocab_size)
        sentences, subreddits = tf.train.shuffle_batch(
            [word_indices, features['subreddit']],
            batch_size,
            capacity=1000 + 3 * batch_size,
            min_after_dequeue=1000,
            enqueue_many=False
        )
    return sentences, subreddits
def text_module_fn():
    """Define a module that maps strings to rows of a [100, 10] variable.

    Each input string is hashed into as many buckets as ``weights`` has rows
    and the matching row is gathered. The string placeholder and the gathered
    rows are registered as the module signature.
    """
    table = tf.get_variable(
        "weights", dtype=tf.float32, shape=[100, 10])
    # initializer=tf.random_uniform_initializer())
    text_input = tf.placeholder(tf.string, shape=[None])
    num_rows = table.get_shape()[0]
    bucket_ids = tf.string_to_hash_bucket_fast(text_input, num_rows)
    gathered_rows = tf.gather(table, bucket_ids)
    hub.add_signature(inputs=text_input, outputs=gathered_rows)
def _model_fn(self):
    """Build the attention branch, join it with the common layer, return logits.

    The user goods sequence and the target goods id are hashed into
    ``self.goods_bucket_size`` buckets and passed to ``self.attention_layer``.
    Its two outputs are concatenated with ``self.common_layer`` and fed to
    ``self.fc_net``.
    """
    # feature_columns not include attention feature
    bucket_size = self.goods_bucket_size
    sequence_ids = tf.string_to_hash_bucket_fast(self.din_user_goods_seq,
                                                 bucket_size)
    target_ids = tf.string_to_hash_bucket_fast(self.din_target_goods_id,
                                               bucket_size)

    sequence_embedding, target_embedding = self.attention_layer(
        sequence_ids,
        target_ids,
        self.goods_bucket_size,
        self.goods_embedding_size,
        self.goods_attention_hidden_units,
        id_type="click_seq")

    combined = tf.concat(
        [self.common_layer, sequence_embedding, target_embedding], axis=1)
    return self.fc_net(combined, 1)
def hash_column(self, input_column, hash_bucket_size, name="hash_column"):
    """Hash a column into ``hash_bucket_size`` buckets.

    A description line of the operation is appended to ``self.params``. The
    column is converted to strings before hashing, and the op is created
    inside the variable scope ``name``.

    Returns:
        The tensor of bucket ids.
    """
    self.params.append(
        "Hash: name = {}, bucket_size = {}, num_params = 0".format(
            name, hash_bucket_size))
    with tf.variable_scope(name):
        as_text = tf.as_string(input_column)
        return tf.string_to_hash_bucket_fast(as_text, hash_bucket_size)
def build_sequence_features(self):
    """Hash the click sequence and compute its "sum" sequence embedding.

    Stores the bucket ids on ``self.click_seq_50size_idx`` and the result of
    ``self.get_seq_embedding`` on ``self.click_seq_50size_embed``.
    """
    hashed_clicks = tf.string_to_hash_bucket_fast(
        self.click_seq_50size_array, self.ITEM_CNT)
    self.click_seq_50size_idx = hashed_clicks
    self.click_seq_50size_embed = self.get_seq_embedding(
        self.item_embedding,
        self.click_seq_50size_idx,
        self.click_seq_50size_len,
        self.item_embedding_size,
        "sum")
def encode(self, input_attrs):
    """Encode input_attrs to embeddings.

    Categorical attributes are shifted by their per-feature offset and looked
    up in one coalesced table. Multivalent attributes are split on "|", hashed
    and looked up with a "mean" combiner. Continuous attributes are reshaped
    and optionally batch-normalised. All parts are concatenated (categorical
    first) and optionally passed through a dense layer.

    Args:
      input_attrs: A list in the format of [continuous_attrs, categorical_attrs]

    Returns:
      Embeddings.
    """
    continuous_attrs = input_attrs[0]
    categorical_attrs = input_attrs[1]

    to_concats_cate = None
    if self._categorical_features:
        coalesced_attrs = [
            categorical_attrs[:, idx] + self._offsets[idx]
            for idx, _, _, _ in self._categorical_features
        ]
        with tf.device('/cpu:0'):
            attrs = tf.reshape(tf.stack(coalesced_attrs, axis=-1), [-1])
            # NOTE(review): `unique=True` is kept from the original call; it
            # is presumably specific to the TensorFlow build in use - confirm.
            to_concats_cate = tf.nn.embedding_lookup(
                self._emb_table["coalesced_embed"], attrs,
                name=self._name + 'embedding_lookup', unique=True)

    # Fix: multivalent embeddings used to be `.append`-ed to
    # `to_concats_cate`, which is either a Tensor or None and has no such
    # method. They are now collected separately and concatenated below.
    # NOTE(review): presumably each lookup yields one row per example so that
    # the concat along the last axis lines up - confirm.
    multivalent_embs = []
    if self._multivalent_features:
        for idx, attr_name, max_num, _ in self._multivalent_features:
            sparse_attr = tf.strings.split(categorical_attrs[:, idx], "|")
            ids = tf.string_to_hash_bucket_fast(
                sparse_attr.values, max_num,
                name=self._name + 'to_hash_bucket_%s' % (attr_name))
            sparse_ids = tf.SparseTensor(sparse_attr.indices, ids,
                                         sparse_attr.dense_shape)
            with tf.device('/cpu:0'):
                multivalent_embs.append(
                    tf.nn.embedding_lookup_sparse(
                        self._emb_table[attr_name],
                        sp_ids=sparse_ids,
                        sp_weights=None,
                        combiner='mean',
                        name=self._name
                        + 'embedding_lookup_sparse_%s' % (attr_name)))

    with tf.variable_scope(self._name + 'attrs_encoding',
                           reuse=tf.AUTO_REUSE):
        raw_emb_con = None
        continuous_feats_num = (self._feature_num
                                - len(self._categorical_features)
                                - len(self._multivalent_features))
        if continuous_feats_num > 0:  # contains continuous features
            raw_emb_con = tf.reshape(continuous_attrs,
                                     [-1, continuous_feats_num])

        cate_parts = []
        if to_concats_cate is not None:
            cate_parts.append(
                tf.reshape(
                    to_concats_cate,
                    [-1, len(self._categorical_features) * self.emb_dim]))
        cate_parts.extend(multivalent_embs)

        if not cate_parts:
            raw_emb_cate = None
        elif len(cate_parts) == 1:
            # A single part is used as is, so the graph is unchanged when
            # there are no multivalent features.
            raw_emb_cate = cate_parts[0]
        else:
            raw_emb_cate = tf.concat(cate_parts, axis=-1)

        if raw_emb_con is not None:
            if self._use_input_bn:
                raw_emb_con = tf.layers.batch_normalization(
                    raw_emb_con, training=self._is_training)
            raw_emb = raw_emb_con
            if raw_emb_cate is not None:
                raw_emb = tf.concat([raw_emb_cate, raw_emb], axis=-1,
                                    name='con_cate_concat')
        else:
            print('no continuous feature to emb')
            raw_emb = raw_emb_cate

        if self._need_dense:
            raw_emb = tf.layers.dense(raw_emb, self._output_dim,
                                      activation=self._act, name='dense')
    return raw_emb
def in_training_set(line):
    """Returns a boolean tensor, true if the line is in the training set."""
    # A random split would not be reproducible: stopping and restarting
    # training later would produce a different split. A simple random split
    # also won't work with a dataset that's too big to `.cache()` as we are
    # doing here.
    num_buckets = 1000000
    # Bucket ids below this bound belong to the training set; the bound uses
    # `train_fraction` from the enclosing scope.
    train_bucket_limit = int(train_fraction * num_buckets)
    # The hash bucket id acts as a random number that is deterministic per
    # example.
    return tf.string_to_hash_bucket_fast(line, num_buckets) < train_bucket_limit
def input_parser(csv_columns, csv_column_defaults, categorical_cols,
                 mutli_value_cols, line):
    """Parse one tab-separated line into a dict of feature tensors.

    Args:
        csv_columns: column names, in file order.
        csv_column_defaults: record defaults handed to ``tf.decode_csv``.
        categorical_cols: mapping from column name to bucket count; these
            columns are hashed and one-hot encoded.
        mutli_value_cols: mapping from column name to bucket count; these
            columns hold comma-separated values that are hashed, one-hot
            encoded and summed into a single vector.
        line: the raw CSV line; the value 'null' is treated as missing.

    Returns:
        Dict from column name to tensor. Columns in neither mapping are
        converted to float and get a trailing dimension of size one.
    """
    items = tf.decode_csv(line, csv_column_defaults, '\t', na_value='null')
    features = dict(zip(csv_columns, items))

    for cate in categorical_cols:
        depth = categorical_cols[cate]
        bucket_ids = tf.string_to_hash_bucket_fast(features[cate], depth)
        features[cate] = tf.squeeze(tf.one_hot(bucket_ids, depth=depth))

    for cate in csv_columns:
        if cate in mutli_value_cols:
            depth = mutli_value_cols[cate]
            values = tf.string_split(
                tf.expand_dims(features[cate], 0), ',').values
            one_hots = tf.one_hot(
                tf.string_to_hash_bucket_fast(values, depth), depth=depth)
            features[cate] = tf.reduce_sum(one_hots, 0)
            continue
        if cate in categorical_cols:
            # Already encoded by the first loop.
            continue
        features[cate] = tf.expand_dims(tf.to_float(features[cate]), -1)

    return features
def testStringToHashBucketsFast(self):
    """Checks the bucket ids of four known strings with ten buckets."""
    # Expected values, fingerprint modulo 10:
    #   Fingerprint64('a') -> 12917804110809363939 -> mod 10 -> 9
    #   Fingerprint64('b') -> 11795596070477164822 -> mod 10 -> 2
    #   Fingerprint64('c') -> 11430444447143000872 -> mod 10 -> 2
    #   Fingerprint64('d') -> 4470636696479570465 -> mod 10 -> 5
    expected = [9, 2, 2, 5]
    with self.test_session():
        strings = tf.placeholder(tf.string)
        buckets = tf.string_to_hash_bucket_fast(strings, 10)
        observed = buckets.eval(feed_dict={strings: ['a', 'b', 'c', 'd']})
        self.assertAllEqual(expected, observed)
def preprocess_fn(inputs):
    """Map embedding features to table indices and reshape scalar features.

    Every feature named in ``self.config.embedding_config`` (``self`` comes
    from the enclosing scope) is reduced to the range given by the first
    element of its config entry: integer features by modulo, all others by
    string hashing. ``context``, ``position`` and ``label`` are converted to
    float and reshaped to ``[-1, 1]``. ``inputs`` is modified in place and
    returned.
    """
    embedding_config = self.config.embedding_config
    for name in embedding_config:
        value_size = embedding_config[name][0]
        raw = inputs[name]
        if raw.dtype.is_integer:
            inputs[name] = tf.mod(raw, value_size)
        else:
            inputs[name] = tf.string_to_hash_bucket_fast(raw, value_size)

    for column in ('context', 'position', 'label'):
        inputs[column] = tf.reshape(tf.to_float(inputs[column]), [-1, 1])
    return inputs
def call(self, x, mask=None, **kwargs):
    """Hash ``x`` into bucket ids, optionally reserving id 0 for zeros.

    Non-string input is converted with ``tf.as_string`` first. Without
    ``mask_zero`` the values are hashed into ``num_buckets`` buckets. With
    ``mask_zero`` they are hashed into ``num_buckets - 1`` buckets and shifted
    up by one, while inputs equal to ``"0"`` or ``"0.0"`` are mapped to 0.

    Args:
        x: input tensor.
        mask: ignored on input; only reused as a local name.
        **kwargs: ignored.

    Returns:
        The tensor of bucket ids.
    """
    if x.dtype != tf.string:
        x = tf.as_string(x, )

    if self.mask_zero:
        bucket_count = self.num_buckets - 1
    else:
        bucket_count = self.num_buckets
    hash_x = tf.string_to_hash_bucket_fast(x, bucket_count,
                                           name=None)  # weak hash

    if not self.mask_zero:
        return hash_x

    not_zero_int = tf.cast(tf.not_equal(x, "0"), 'int64')
    not_zero_float = tf.cast(tf.not_equal(x, "0.0"), 'int64')
    # The product is 0 exactly where x is "0" or "0.0".
    mask = not_zero_int * not_zero_float
    return (hash_x + 1) * mask
def hash_embedding_lookup(self, tensor, fc):
    """Hash ``tensor`` and look the ids up in the configured embedding table.

    Args:
        tensor: string tensor to hash.
        fc: feature configuration supporting ``get``; ``bucket_size`` gives
            the number of hash buckets. ``embedding_name`` selects the table
            in ``self.embedding_dict`` and defaults to
            ``<feature_name>_embedding`` when unset.

    Returns:
        The looked-up embeddings.
    """
    bucket_size = fc.get(TransformAttr.bucket_size)

    table_name = fc.get(TransformAttr.embedding_name)
    if table_name is None:
        table_name = fc.get(TransformAttr.feature_name) + '_embedding'

    ids = tf.string_to_hash_bucket_fast(tensor, bucket_size)
    table = self.embedding_dict[table_name]
    return tf.nn.embedding_lookup(table, ids)
def _process_list_column(list_column, vocab_size):
    '''
    Split a string-list column and hash every element.

    The column is split with the delimiter '##' and each resulting value is
    hashed into `vocab_size` buckets. The result is a SparseTensor with the
    indices and dense shape of the split and the bucket ids as values.
    '''
    split = tf.string_split(list_column, delimiter='##')
    hashed_values = tf.string_to_hash_bucket_fast(split.values, vocab_size)
    sparse_ints = tf.SparseTensor(indices=split.indices,
                                  values=hashed_values,
                                  dense_shape=split.dense_shape)
    #return tf.cast(tf.sparse_to_indicator(sparse_ints, vocab_size = vocab_size), tf.float32), sparse_ints
    return sparse_ints
def build_sequence_features(self):
    """Build arrays, lengths, hashes and mean embeddings of the list features.

    The three ";"-separated string features (tag ids, unclicked sequence,
    clicked sequence) are split and reshaped to one row per entry of
    ``self.user_id``. For each, the number of non-zero entries per row, the
    hash-bucket ids and the "mean" sequence embedding are stored on ``self``.
    """

    def _split_rows(raw):
        # One row per user; the row width is inferred by the reshape.
        return tf.reshape(tf.strings.split(raw, sep=";").values,
                          shape=[tf.shape(self.user_id)[0], -1])

    # build feature
    self.tag_ids_array = _split_rows(self.tag_ids)
    self.unclick_seq_50size_array = _split_rows(self.unclick_seq_50size)
    self.click_seq_50size_array = _split_rows(self.click_seq_50size)

    self.click_seq_50size_len = tf.count_nonzero(
        self.click_seq_50size_array, 1)
    self.tag_ids_len = tf.count_nonzero(self.tag_ids_array, 1)
    self.unclick_seq_50size_len = tf.count_nonzero(
        self.unclick_seq_50size_array, 1)

    to_bucket = tf.string_to_hash_bucket_fast
    self.click_seq_50size_hash = to_bucket(self.click_seq_50size_array,
                                           self.ITEM_CNT)
    self.unclick_seq_50size_hash = to_bucket(self.unclick_seq_50size_array,
                                             self.ITEM_CNT)
    self.tag_ids_hash = to_bucket(self.tag_ids_array, self.TAG_CNT)

    # list features embed
    self.click_seq_50size_embed = self.get_seq_embedding(
        self.item_embedding, self.click_seq_50size_hash,
        self.click_seq_50size_len, self.item_embedding_size, "mean")
    # The attribute name (double "c") is kept as is, since other code may
    # read it.
    self.uncclick_seq_50size_embed = self.get_seq_embedding(
        self.item_embedding, self.unclick_seq_50size_hash,
        self.unclick_seq_50size_len, self.item_embedding_size, "mean")
    self.tag_ids_embed = self.get_seq_embedding(
        self.tag_embedding, self.tag_ids_hash,
        self.tag_ids_len, self.tag_embedding_size, "mean")
def has_matching_bucket(self, feature):
    """Tell whether the salted key of ``feature`` hashes into this split.

    The key returned by ``self._key_fn`` is joined with ``self._salt`` and
    hashed into 100 buckets. The bucket id divided by 100 is then tested
    against the half-open interval ``(self._thresh_A, self._thresh_B]``.

    Returns:
        A boolean tensor.
    """
    N = 100
    key = self._key_fn(feature)
    salt = tf.constant(self._salt, dtype=tf.string)
    bucket_id = tf.string_to_hash_bucket_fast(tf.string_join([key, salt]), N)

    fraction = tf.cast(bucket_id, dtype=tf.float32)
    fraction = fraction / tf.constant(N, dtype=tf.float32)

    above_lower = tf.less(self._thresh_A, fraction)
    within_upper = tf.less_equal(fraction, self._thresh_B)
    return tf.logical_and(above_lower, within_upper)
def get_onehot(input_value, onehot_para, name="get_onehot"):
    '''
    One-hot encode a string or int value.

    Args:
        input_value: tensor to encode. String tensors are hashed into
            `onehot_para` buckets; any other dtype is cast to int32 and used
            as the index directly.
        onehot_para: depth of the one-hot encoding (and number of hash
            buckets for string input).
        name: name scope for the created ops.

    Returns:
        Tuple `(hash_value, onehot_emb)`: the index tensor and its one-hot
        encoding.
    '''
    with tf.name_scope(name):
        if input_value.dtype == tf.string:
            hash_value = tf.string_to_hash_bucket_fast(input_value,
                                                       onehot_para)
        else:
            # Fix: this branch is only reached for non-string input, which
            # `tf.string_to_number` rejects; a cast is the conversion that
            # works for numeric tensors.
            hash_value = tf.cast(input_value, tf.int32)
        onehot_emb = tf.one_hot(hash_value, onehot_para)
    return hash_value, onehot_emb
def _get_features_dict(input_dict):
    """Extracts features dict from input dict.

    The result holds the image, the source id hashed into ``HASH_BINS``
    buckets (cast to int32) under ``HASH_KEY``, the true image shape and,
    when present in ``input_dict``, the original image.
    """
    input_fields = fields.InputDataFields
    source_id_hash = tf.string_to_hash_bucket_fast(
        input_dict[input_fields.source_id], HASH_BINS)

    features = {}
    features[input_fields.image] = input_dict[input_fields.image]
    features[HASH_KEY] = tf.cast(source_id_hash, tf.int32)
    features[input_fields.true_image_shape] = (
        input_dict[input_fields.true_image_shape])

    if input_fields.original_image in input_dict:
        features[input_fields.original_image] = (
            input_dict[input_fields.original_image])
    return features
def _get_features_dict(input_dict):
    """Extracts features dict from input dict.

    The source id is first passed through
    ``_replace_empty_string_with_random_number`` and then hashed into
    ``HASH_BINS`` buckets; the int32 result is stored under ``HASH_KEY``.
    The original image is copied only when present in ``input_dict``.
    """
    input_fields = fields.InputDataFields

    source_id = _replace_empty_string_with_random_number(
        input_dict[input_fields.source_id])
    hash_key = tf.cast(
        tf.string_to_hash_bucket_fast(source_id, HASH_BINS), tf.int32)

    features = {}
    features[input_fields.image] = input_dict[input_fields.image]
    features[HASH_KEY] = hash_key
    for key in (input_fields.true_image_shape,
                input_fields.original_image_spatial_shape):
        features[key] = input_dict[key]

    if input_fields.original_image in input_dict:
        features[input_fields.original_image] = (
            input_dict[input_fields.original_image])
    return features
def loss(self, input_batch, l2_regularization_strength=None,
         global_condition=None, local_condition=None, name='wavenet'):
    '''Creates a WaveNet network and returns the autoencoding loss.

    The variables are all scoped to the given name.

    Args:
        input_batch: audio batch; it is mu-law encoded and one-hot encoded.
        l2_regularization_strength: when not None, an L2 penalty over all
            trainable variables without 'bias' in their name is added,
            scaled by this factor.
        global_condition: optional ids, one-hot encoded with
            `self.global_channels` classes.
        local_condition: optional strings, hashed into
            `self.local_channels` buckets, one-hot encoded and resized to
            the first two dimensions of the encoded input.
        name: name scope of the created ops.

    Returns:
        The mean cross-entropy loss, plus the L2 term when requested.
    '''
    with tf.name_scope(name):
        # We mu-law encode and quantize the input audioform.
        input_batch = mu_law_encode(input_batch, self.quantization_channels)
        encoded = self._one_hot(input_batch)

        if self.scalar_input:
            network_input = tf.reshape(
                tf.cast(input_batch, tf.float32),
                [self.batch_size, -1, 1])
        else:
            network_input = encoded

        gc_encoded = None
        if global_condition is not None:
            gc_encoded = tf.one_hot(global_condition, self.global_channels)

        lc_encoded = None
        if local_condition is not None:
            target_size = tf.shape(encoded)[0:2]
            lc_buckets = tf.string_to_hash_bucket_fast(local_condition,
                                                       self.local_channels)
            lc_one_hot = tf.one_hot(lc_buckets, self.local_channels)
            # This does a dumb upsampling of text data to audio sample rate
            # e.g. 'The car' -> 'TTTTTTThhhhhhheeeeee cccccaaaarrrr'
            lc_encoded = tf.image.resize_images(
                lc_one_hot, size=target_size,
                method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)

        raw_output = self._create_network(network_input,
                                          global_condition=gc_encoded,
                                          local_condition=lc_encoded)

        with tf.name_scope('loss'):
            # Shift original input left by one sample, which means that
            # each output sample has to predict the next input sample.
            targets = tf.slice(encoded, [0, 1, 0],
                               [-1, tf.shape(encoded)[1] - 1, -1])
            targets = tf.pad(targets, [[0, 0], [0, 1], [0, 0]])

            flat_shape = [-1, self.quantization_channels]
            prediction = tf.reshape(raw_output, flat_shape)
            loss = tf.nn.softmax_cross_entropy_with_logits(
                prediction, tf.reshape(targets, flat_shape))
            reduced_loss = tf.reduce_mean(loss)

            tf.scalar_summary('loss', reduced_loss)

            if l2_regularization_strength is None:
                return reduced_loss

            # L2 regularization for all trainable parameters
            l2_loss = tf.add_n([tf.nn.l2_loss(v)
                                for v in tf.trainable_variables()
                                if 'bias' not in v.name])

            # Add the regularization term to the loss
            total_loss = (reduced_loss +
                          l2_regularization_strength * l2_loss)

            tf.scalar_summary('l2_loss', l2_loss)
            tf.scalar_summary('total_loss', total_loss)

            return total_loss
def _eval_input_fn(params=None):
    """Returns `features` and `labels` tensor dictionaries for evaluation.

    `eval_config`, `eval_input_config` and `model_config` are not parameters;
    they are read from the surrounding scope.

    Args:
      params: Parameter dictionary passed from the estimator. It is ignored.

    Returns:
      features: Dictionary of feature tensors.
        features[fields.InputDataFields.image] is a [1, H, W, C] float32 tensor
          with preprocessed images.
        features[HASH_KEY] is a [1] int32 tensor representing unique
          identifiers for the images.
        features[fields.InputDataFields.true_image_shape] is a [1, 3]
          int32 tensor representing the true image shapes, as preprocessed
          images could be padded.
        features[fields.InputDataFields.original_image] is a [1, H', W', C]
          float32 tensor with the original image.
      labels: Dictionary of groundtruth tensors.
        labels[fields.InputDataFields.groundtruth_boxes] is a [1, num_boxes, 4]
          float32 tensor containing the corners of the groundtruth boxes.
        labels[fields.InputDataFields.groundtruth_classes] is a
          [1, num_boxes, num_classes] float32 one-hot tensor of classes.
        labels[fields.InputDataFields.groundtruth_area] is a [1, num_boxes]
          float32 tensor containing object areas.
        labels[fields.InputDataFields.groundtruth_is_crowd] is a [1, num_boxes]
          bool tensor indicating if the boxes enclose a crowd.
        labels[fields.InputDataFields.groundtruth_difficult] is a [1, num_boxes]
          int32 tensor indicating if the boxes represent difficult instances.
        -- Optional --
        labels[fields.InputDataFields.groundtruth_instance_masks] is a
          [1, num_boxes, H, W] float32 tensor containing only binary values,
          which represent instance masks for objects.

    Raises:
      TypeError: if the `eval_config`, `eval_input_config` or `model_config`
        are not of the correct type.
    """
    del params
    if not isinstance(eval_config, eval_pb2.EvalConfig):
        raise TypeError('For eval mode, the `eval_config` must be a '
                        'eval_pb2.EvalConfig.')
    if not isinstance(eval_input_config, input_reader_pb2.InputReader):
        raise TypeError('The `eval_input_config` must be a '
                        'input_reader_pb2.InputReader.')
    if not isinstance(model_config, model_pb2.DetectionModel):
        raise TypeError('The `model_config` must be a '
                        'model_pb2.DetectionModel.')

    num_classes = config_util.get_number_of_classes(model_config)
    model = model_builder.build(model_config, is_training=False)
    image_resizer_config = config_util.get_image_resizer_config(model_config)
    image_resizer_fn = image_resizer_builder.build(image_resizer_config)

    # No augmentation at eval time; the original image is retained so it can
    # be exposed under `original_image` below.
    transform_data_fn = functools.partial(
        transform_input_data, model_preprocess_fn=model.preprocess,
        image_resizer_fn=image_resizer_fn,
        num_classes=num_classes,
        data_augmentation_fn=None,
        retain_original_image=True)
    dataset = dataset_builder.build(eval_input_config,
                                    transform_input_data_fn=transform_data_fn)
    input_dict = dataset_util.make_initializable_iterator(dataset).get_next()

    # Source ids are strings; hash them into HASH_BINS integer buckets.
    hash_from_source_id = tf.string_to_hash_bucket_fast(
        input_dict[fields.InputDataFields.source_id], HASH_BINS)
    features = {
        fields.InputDataFields.image:
            input_dict[fields.InputDataFields.image],
        fields.InputDataFields.original_image:
            input_dict[fields.InputDataFields.original_image],
        HASH_KEY: tf.cast(hash_from_source_id, tf.int32),
        fields.InputDataFields.true_image_shape:
            input_dict[fields.InputDataFields.true_image_shape]
    }

    labels = {
        fields.InputDataFields.groundtruth_boxes:
            input_dict[fields.InputDataFields.groundtruth_boxes],
        fields.InputDataFields.groundtruth_classes:
            input_dict[fields.InputDataFields.groundtruth_classes],
        fields.InputDataFields.groundtruth_area:
            input_dict[fields.InputDataFields.groundtruth_area],
        fields.InputDataFields.groundtruth_is_crowd:
            input_dict[fields.InputDataFields.groundtruth_is_crowd],
        fields.InputDataFields.groundtruth_difficult:
            tf.cast(input_dict[fields.InputDataFields.groundtruth_difficult],
                    tf.int32)
    }
    if fields.InputDataFields.groundtruth_instance_masks in input_dict:
        labels[fields.InputDataFields.groundtruth_instance_masks] = input_dict[
            fields.InputDataFields.groundtruth_instance_masks]

    # Add a batch dimension to the tensors.
    features = {
        key: tf.expand_dims(feature, axis=0)
        for key, feature in features.items()
    }
    labels = {
        key: tf.expand_dims(label, axis=0)
        for key, label in labels.items()
    }

    return features, labels
def _train_input_fn(params=None):
    """Builds the `features` and `labels` tensor dictionaries for training.

    `train_config`, `train_input_config` and `model_config` are not
    parameters; they are read from the surrounding scope.

    Args:
      params: Parameter dictionary passed from the estimator. When it is
        non-empty its 'batch_size' entry is used; otherwise the batch size
        comes from `train_config.batch_size`.

    Returns:
      features: Dictionary of feature tensors.
        features[fields.InputDataFields.image] is a [batch_size, H, W, C]
          float32 tensor with preprocessed images.
        features[HASH_KEY] is a [batch_size] int32 tensor representing unique
          identifiers for the images.
        features[fields.InputDataFields.true_image_shape] is a [batch_size, 3]
          int32 tensor representing the true image shapes, as preprocessed
          images could be padded.
      labels: Dictionary of groundtruth tensors.
        labels[fields.InputDataFields.num_groundtruth_boxes] is a [batch_size]
          int32 tensor indicating the number of groundtruth boxes.
        labels[fields.InputDataFields.groundtruth_boxes] is a
          [batch_size, num_boxes, 4] float32 tensor containing the corners of
          the groundtruth boxes.
        labels[fields.InputDataFields.groundtruth_classes] is a
          [batch_size, num_boxes, num_classes] float32 one-hot tensor of
          classes.
        labels[fields.InputDataFields.groundtruth_weights] is a
          [batch_size, num_boxes] float32 tensor containing groundtruth weights
          for the boxes.
        -- Optional --
        labels[fields.InputDataFields.groundtruth_instance_masks] is a
          [batch_size, num_boxes, H, W] float32 tensor containing only binary
          values, which represent instance masks for objects.
        labels[fields.InputDataFields.groundtruth_keypoints] is a
          [batch_size, num_boxes, num_keypoints, 2] float32 tensor containing
          keypoints for each box.

    Raises:
      TypeError: if `train_config`, `train_input_config` or `model_config`
        are not of the correct type.
    """
    input_fields = fields.InputDataFields

    # Validate the configuration protos before building anything.
    if not isinstance(train_config, train_pb2.TrainConfig):
        raise TypeError('For training mode, the `train_config` must be a '
                        'train_pb2.TrainConfig.')
    if not isinstance(train_input_config, input_reader_pb2.InputReader):
        raise TypeError('The `train_input_config` must be a '
                        'input_reader_pb2.InputReader.')
    if not isinstance(model_config, model_pb2.DetectionModel):
        raise TypeError('The `model_config` must be a '
                        'model_pb2.DetectionModel.')

    # One preprocessing step is built per configured augmentation option.
    augmentation_steps = [
        preprocessor_builder.build(option)
        for option in train_config.data_augmentation_options
    ]
    augment_fn = functools.partial(
        augment_input_data,
        data_augmentation_options=augmentation_steps)

    detection_model = model_builder.build(model_config, is_training=True)
    resizer_config = config_util.get_image_resizer_config(model_config)
    resizer_fn = image_resizer_builder.build(resizer_config)

    transform_fn = functools.partial(
        transform_input_data,
        model_preprocess_fn=detection_model.preprocess,
        image_resizer_fn=resizer_fn,
        num_classes=config_util.get_number_of_classes(model_config),
        data_augmentation_fn=augment_fn)

    # The estimator-supplied batch size wins over the configured one.
    if params:
        batch_size = params['batch_size']
    else:
        batch_size = train_config.batch_size

    dataset = dataset_builder.build(
        train_input_config,
        transform_input_data_fn=transform_fn,
        batch_size=batch_size,
        max_num_boxes=train_config.max_number_of_boxes,
        num_classes=config_util.get_number_of_classes(model_config),
        spatial_image_shape=config_util.get_spatial_image_size(
            resizer_config))
    batch = dataset_util.make_initializable_iterator(dataset).get_next()

    # Source ids are strings; hash them into HASH_BINS integer buckets.
    source_id_hash = tf.string_to_hash_bucket_fast(
        batch[input_fields.source_id], HASH_BINS)

    features = {
        input_fields.image: batch[input_fields.image],
        HASH_KEY: tf.cast(source_id_hash, tf.int32),
        input_fields.true_image_shape: batch[input_fields.true_image_shape],
    }

    # Entries that are always read from the batch, in output order.
    required_label_keys = (
        input_fields.num_groundtruth_boxes,
        input_fields.groundtruth_boxes,
        input_fields.groundtruth_classes,
        input_fields.groundtruth_weights,
    )
    labels = {key: batch[key] for key in required_label_keys}

    # Entries that are forwarded only when the dataset provides them.
    optional_label_keys = (
        input_fields.groundtruth_keypoints,
        input_fields.groundtruth_instance_masks,
    )
    for key in optional_label_keys:
        if key in batch:
            labels[key] = batch[key]

    return features, labels
def main():
    """Generates audio from a restored WaveNet checkpoint.

    Reads the command-line arguments via `get_arguments()`, loads the model
    hyper-parameters from the JSON file at `args.wavenet_params`, builds a
    `WaveNetModel` with batch size 1 and restores its variables from
    `args.checkpoint`. It then draws `args.samples` samples one at a time
    from the predicted distribution, optionally starting from a seed
    waveform (`args.wav_seed`). The result is written as an audio summary
    under `<args.logdir>/generate/<timestamp>` and, when `args.wav_out_path`
    is set, as a wav file (also every `args.save_every` steps if given).
    """
    args = get_arguments()
    started_datestring = "{0:%Y-%m-%dT%H-%M-%S}".format(datetime.now())
    logdir = os.path.join(args.logdir, 'generate', started_datestring)
    with open(args.wavenet_params, 'r') as config_file:
        wavenet_params = json.load(config_file)

    sess = tf.Session()

    net = WaveNetModel(
        batch_size=1,
        dilations=wavenet_params['dilations'],
        filter_width=wavenet_params['filter_width'],
        residual_channels=wavenet_params['residual_channels'],
        dilation_channels=wavenet_params['dilation_channels'],
        quantization_channels=wavenet_params['quantization_channels'],
        skip_channels=wavenet_params['skip_channels'],
        use_biases=wavenet_params['use_biases'],
        scalar_input=wavenet_params['scalar_input'],
        initial_filter_width=wavenet_params['initial_filter_width'])

    samples = tf.placeholder(tf.int32)

    # NOTE(review): this is a truthiness test, so a speaker id of 0 would be
    # treated as "no speaker" -- confirm against get_arguments() defaults.
    if args.speaker_id:
        id_embedded = tf.one_hot([args.speaker_id],
                                 net.global_channels,
                                 axis=-1)
    else:
        id_embedded = None

    if args.speaker_text:
        # One row holding the individual characters of the text.
        text = np.reshape(list(args.speaker_text), (1, -1))
        size = (1, wavenet_params['sample_rate'])
        text_embedded = tf.string_to_hash_bucket_fast(
            text, wavenet_params['local_channels'])
        text_embedded = tf.one_hot(text_embedded,
                                   wavenet_params['local_channels'])
        # This does a dumb upsampling of text data to audio sample rate
        # e.g. 'The car' -> 'TTTTTTThhhhhhheeeeee cccccaaaarrrr'
        text_embedded = tf.image.resize_images(
            text_embedded,
            size=size,
            method=tf.image.ResizeMethod.NEAREST_NEIGHBOR)
    else:
        text_embedded = None

    if args.fast_generation:
        next_sample = net.predict_proba_incremental(samples, id_embedded,
                                                    text_embedded)
        sess.run(tf.initialize_all_variables())
        sess.run(net.init_ops)
    else:
        next_sample = net.predict_proba(samples, id_embedded, text_embedded)

    # Restore everything except the incremental generator's own state
    # variables; `var.name[:-2]` drops the trailing two characters of the
    # variable name to obtain the key used for the checkpoint lookup.
    variables_to_restore = {
        var.name[:-2]: var for var in tf.all_variables()
        if not ('state_buffer' in var.name or 'pointer' in var.name)}
    saver = tf.train.Saver(variables_to_restore)

    print('Restoring model from {}'.format(args.checkpoint))
    saver.restore(sess, args.checkpoint)

    quantization_channels = wavenet_params['quantization_channels']
    decode = mu_law_decode(samples, quantization_channels)

    if args.wav_seed:
        seed = create_seed(args.wav_seed,
                           wavenet_params['sample_rate'],
                           quantization_channels)
        waveform = sess.run(seed).tolist()
    else:
        # Without a seed, start from a single random quantized sample.
        waveform = np.random.randint(quantization_channels,
                                     size=(1,)).tolist()

    # The fetch list does not change between steps, so build it once. In
    # fast-generation mode the model's push ops are run alongside the
    # prediction.
    outputs = [next_sample]
    if args.fast_generation:
        outputs.extend(net.push_ops)

    if args.fast_generation and args.wav_seed:
        # When using the incremental generation, we need to
        # feed in all priming samples one by one before starting the
        # actual generation.
        # TODO This could be done much more efficiently by passing the waveform
        # to the incremental generator as an optional argument, which would be
        # used to fill the queues initially.
        # NOTE(review): the slice below skips the final `args.window + 1`
        # seed samples -- confirm this is the intended priming range.
        print('Priming generation...')
        for i, x in enumerate(waveform[:-(args.window + 1)]):
            if i % 100 == 0:
                print('Priming sample {}'.format(i))
            sess.run(outputs, feed_dict={samples: x})
        print('Done.')

    last_sample_timestamp = datetime.now()
    for step in range(args.samples):
        if args.fast_generation:
            # The incremental model consumes only the most recent sample.
            window = waveform[-1]
        elif len(waveform) > args.window:
            window = waveform[-args.window:]
        else:
            window = waveform

        # Run the WaveNet to predict the next sample.
        prediction = sess.run(outputs, feed_dict={samples: window})[0]
        sample = np.random.choice(
            np.arange(quantization_channels), p=prediction)
        print(sample)
        waveform.append(sample)

        # Show progress only once per second.
        current_sample_timestamp = datetime.now()
        time_since_print = current_sample_timestamp - last_sample_timestamp
        if time_since_print.total_seconds() > 1.:
            print('Sample {:3<d}/{:3<d}'.format(step + 1, args.samples),
                  end='\r')
            last_sample_timestamp = current_sample_timestamp

        # If we have partial writing, save the result so far.
        if (args.wav_out_path and args.save_every and
                (step + 1) % args.save_every == 0):
            out = sess.run(decode, feed_dict={samples: waveform})
            write_wav(out, wavenet_params['sample_rate'], args.wav_out_path)

    # Introduce a newline to clear the carriage return from the progress.
    print()

    # Save the result as an audio summary.
    writer = tf.train.SummaryWriter(logdir)
    tf.audio_summary('generated', decode, wavenet_params['sample_rate'])
    summaries = tf.merge_all_summaries()
    summary_out = sess.run(summaries,
                           feed_dict={samples: np.reshape(waveform, [-1, 1])})
    writer.add_summary(summary_out)

    # Save the result as a wav file.
    if args.wav_out_path:
        out = sess.run(decode, feed_dict={samples: waveform})
        write_wav(out, wavenet_params['sample_rate'], args.wav_out_path)

    print('Finished generating. The result can be viewed in TensorBoard.')