def testPreservesMetadata(self):
    """Metadata attached to scalars must survive clone() and from_blob_list()."""
    record = schema.Struct(
        ('a', schema.Scalar(np.float32)),
        ('b', schema.Scalar(
            np.int32, metadata=schema.Metadata(categorical_limit=5))),
        ('c', schema.List(
            schema.Scalar(
                np.int32,
                metadata=schema.Metadata(categorical_limit=6)))))
    # attach metadata to lengths field
    record.c.lengths.set_metadata(schema.Metadata(categorical_limit=7))

    def check_metadata(rec):
        # The same four expectations hold for the original record and for
        # every record derived from it.
        self.assertEqual(None, rec.a.metadata)
        self.assertEqual(5, rec.b.metadata.categorical_limit)
        self.assertEqual(6, rec.c.value.metadata.categorical_limit)
        self.assertEqual(7, rec.c.lengths.metadata.categorical_limit)

    check_metadata(record)
    check_metadata(record.clone())
    check_metadata(schema.from_blob_list(record, [
        np.array([3.4]), np.array([2]), np.array([3]), np.array([1, 2, 3])
    ]))
def __init__(self, model, input_record, seed=0, modulo=None,
             use_hashing=True, name='sparse_feature_hash', **kwargs):
    """Builds the hashed output schema for an IdList or IdScoreList input.

    The output reuses the input's lengths blob; only the id field is
    replaced by hashed indices bounded by `self.modulo`.
    """
    super(SparseFeatureHash, self).__init__(model, name, input_record, **kwargs)
    self.seed = seed
    self.use_hashing = use_hashing

    def make_hashed_indices(source_metadata):
        # Hashed ids keep the source field's feature specs but get the
        # (reduced) categorical limit of the hash space.
        indices = schema.Scalar(
            np.int64, self.get_next_blob_reference("hashed_idx"))
        indices.set_metadata(schema.Metadata(
            categorical_limit=self.modulo,
            feature_specs=source_metadata.feature_specs,
        ))
        return indices

    if schema.equal_schemas(input_record, IdList):
        self.modulo = modulo or self.extract_hash_size(
            input_record.items.metadata)
        self.output_schema = schema.List(
            values=make_hashed_indices(input_record.items.metadata),
            lengths_blob=input_record.lengths,
        )
    elif schema.equal_schemas(input_record, IdScoreList):
        self.modulo = modulo or self.extract_hash_size(
            input_record.keys.metadata)
        self.output_schema = schema.Map(
            keys=make_hashed_indices(input_record.keys.metadata),
            values=input_record.values,
            lengths_blob=input_record.lengths,
        )
    else:
        assert False, "Input type must be one of (IdList, IdScoreList)"

    assert self.modulo >= 1, 'Unexpected modulo: {}'.format(self.modulo)

    # operators in this layer do not have CUDA implementation yet.
    # In addition, since the sparse feature keys that we are hashing are
    # typically on CPU originally, it makes sense to have this layer on CPU.
    self.tags.update([Tags.CPU_ONLY])
def __init__(self, model, input_record, seed,
             name='sparse_feature_hash', **kwargs):
    """Builds output schema blobs for hashing an IdList/IdScoreList input.

    Unlike later revisions, this variant allocates fresh lengths (and, for
    IdScoreList, values) blobs instead of reusing the input's.
    """
    super(SparseFeatureHash, self).__init__(model, name, input_record, **kwargs)
    self.seed = seed

    def scoped(suffix):
        # All blobs created by this layer live under the layer's name.
        return model.net.NextScopedBlob(name + suffix)

    self.lengths_blob = schema.Scalar(np.int32, scoped("_lengths"))

    if schema.equal_schemas(input_record, IdList):
        self.modulo = self.extract_hash_size(input_record.items.metadata)
        hashed_indices = schema.Scalar(np.int64, scoped("_hashed_idx"))
        hashed_indices.set_metadata(schema.Metadata(
            categorical_limit=self.modulo,
            feature_specs=input_record.items.metadata.feature_specs,
        ))
        self.output_schema = schema.List(
            values=hashed_indices,
            lengths_blob=self.lengths_blob,
        )
    elif schema.equal_schemas(input_record, IdScoreList):
        self.values_blob = schema.Scalar(np.float32, scoped("_values"))
        self.modulo = self.extract_hash_size(input_record.keys.metadata)
        hashed_indices = schema.Scalar(np.int64, scoped("_hashed_idx"))
        hashed_indices.set_metadata(schema.Metadata(
            categorical_limit=self.modulo,
            feature_specs=input_record.keys.metadata.feature_specs,
        ))
        self.output_schema = schema.Map(
            keys=hashed_indices,
            values=self.values_blob,
            lengths_blob=self.lengths_blob,
        )
    else:
        assert False, "Input type must be one of (IdList, IdScoreList)"
def set_request_only(field):
    """Mark every scalar in `field` as request-only.

    For each scalar, rebuilds its Metadata with
    feature_is_request_only=True while preserving any existing
    categorical_limit, expected_value, and feature_specs content.

    Raises:
        AssertionError: if a non-integer scalar carries a categorical_limit.
    """
    for f in field.all_scalars():
        # Carry over existing metadata fields, if any.
        categorical_limit, expected_value = None, None
        if f.metadata:
            categorical_limit = f.metadata.categorical_limit
            expected_value = f.metadata.expected_value

        if f.metadata and f.metadata.feature_specs:
            # Preserve the existing spec fields, only flipping the
            # request-only flag.
            feature_specs = schema.FeatureSpec(
                feature_type=f.metadata.feature_specs.feature_type,
                feature_names=f.metadata.feature_specs.feature_names,
                feature_ids=f.metadata.feature_specs.feature_ids,
                feature_is_request_only=True,
            )
        else:
            feature_specs = schema.FeatureSpec(
                feature_is_request_only=True,
            )

        # make sure not to set categorical_limit for a non-integer field
        if not np.issubdtype(f.field_type(), np.integer):
            assert categorical_limit is None, \
                "categorical_limit shouldn't be set for non-integer field"

        f.set_metadata(
            schema.Metadata(
                categorical_limit=categorical_limit,
                expected_value=expected_value,
                feature_specs=feature_specs,
            )
        )
def __init__(self, model, input_record, seed=0, modulo=None,
             use_hashing=True, name='sparse_feature_hash', **kwargs):
    """Builds a hashed IdList/IdScoreList output record.

    The output is a fresh record (NewRecord) of the same family as the
    input; its id field carries metadata whose categorical limit is the
    hash-space size.
    """
    super(SparseFeatureHash, self).__init__(model, name, input_record, **kwargs)
    self.seed = seed
    self.use_hashing = use_hashing

    def hashed_metadata_for(source):
        # Output ids inherit the source field's specs and expected value,
        # but their categorical limit shrinks to the hash-space size.
        return schema.Metadata(
            categorical_limit=self.modulo,
            feature_specs=source.metadata.feature_specs,
            expected_value=source.metadata.expected_value)

    if schema.equal_schemas(input_record, IdList):
        self.modulo = modulo or self.extract_hash_size(
            input_record.items.metadata)
        with core.NameScope(name):
            self.output_schema = schema.NewRecord(model.net, IdList)
        self.output_schema.items.set_metadata(
            hashed_metadata_for(input_record.items))
    elif schema.equal_schemas(input_record, IdScoreList):
        self.modulo = modulo or self.extract_hash_size(
            input_record.keys.metadata)
        with core.NameScope(name):
            self.output_schema = schema.NewRecord(model.net, IdScoreList)
        self.output_schema.keys.set_metadata(
            hashed_metadata_for(input_record.keys))
    else:
        assert False, "Input type must be one of (IdList, IdScoreList)"

    assert self.modulo >= 1, 'Unexpected modulo: {}'.format(self.modulo)

    # Lengths are passed through untouched, so their metadata is too.
    if input_record.lengths.metadata:
        self.output_schema.lengths.set_metadata(
            input_record.lengths.metadata)

    # operators in this layer do not have CUDA implementation yet.
    # In addition, since the sparse feature keys that we are hashing are
    # typically on CPU originally, it makes sense to have this layer on CPU.
    self.tags.update([Tags.CPU_ONLY])
def __init__(self, model, input_record, seed=0, modulo=None,
             use_hashing=True, name='sparse_feature_hash', **kwargs):
    """Builds the hashed output schema for an IdList or IdScoreList input,
    reusing the input's lengths (and, for IdScoreList, values) blobs.
    """
    super(SparseFeatureHash, self).__init__(model, name, input_record, **kwargs)
    self.seed = seed
    self.use_hashing = use_hashing

    if schema.equal_schemas(input_record, IdList):
        self.modulo = modulo or self.extract_hash_size(
            input_record.items.metadata)
        hashed = schema.Scalar(
            np.int64, self.get_next_blob_reference("hashed_idx"))
        # Hashed ids keep the source specs; the limit becomes the hash size.
        hashed.set_metadata(schema.Metadata(
            categorical_limit=self.modulo,
            feature_specs=input_record.items.metadata.feature_specs,
        ))
        self.output_schema = schema.List(
            values=hashed,
            lengths_blob=input_record.lengths,
        )
    elif schema.equal_schemas(input_record, IdScoreList):
        self.modulo = modulo or self.extract_hash_size(
            input_record.keys.metadata)
        hashed = schema.Scalar(
            np.int64, self.get_next_blob_reference("hashed_idx"))
        hashed.set_metadata(schema.Metadata(
            categorical_limit=self.modulo,
            feature_specs=input_record.keys.metadata.feature_specs,
        ))
        self.output_schema = schema.Map(
            keys=hashed,
            values=input_record.values,
            lengths_blob=input_record.lengths,
        )
    else:
        assert False, "Input type must be one of (IdList, IdScoreList)"

    assert self.modulo >= 1, 'Unexpected modulo: {}'.format(self.modulo)
def __init__(self, model, input_record, axis=1, add_axis=0,
             name='concat', **kwargs):
    """Concatenate all scalar fields of a Struct record along `axis`.

    Args:
        model: model this layer is attached to.
        input_record: schema.Struct whose fields must all be schema.Scalar.
            Per-field shapes exclude the batch dimension, so `axis` is
            looked up in the shape at index `axis - 1`.
        axis: concatenation axis, counted *including* the batch dimension.
        add_axis: when truthy, inserts a new unit axis at `axis` in every
            input before concatenating.
        name: base name for the output blob.
    """
    super(Concat, self).__init__(model, name, input_record, **kwargs)
    self.axis = axis
    self.add_axis = add_axis
    assert not (axis == 0 and add_axis == 1), \
        "It's not allowed to add axis=0"
    assert isinstance(input_record, schema.Struct),\
        "Incorrect input type. Expected Struct, but received: {0}".\
        format(input_record)

    # Collect (possibly extended) per-field shapes and validate rank.
    shapes = []
    for field_name, field_type in viewitems(input_record.fields):
        assert isinstance(field_type, schema.Scalar),\
            "Incorrect input type for {}. Expected Scalar, but got: {}".\
            format(field_name, field_type)
        # Assume that first dimension is batch, so actual axis in shape is
        # axis - 1
        shape = list(field_type.field_type().shape)
        if add_axis:
            shape.insert(axis - 1, 1)
        assert len(shape) >= axis,\
            "Concat expects that limited dimensions of the input tensor"
        shapes.append(shape)
    logger.info('Concat Layer input shapes: ' + str(shapes))

    if axis == 0:
        # Batch-axis concat: output keeps the schema of the first input
        # field, just bound to a fresh output blob.
        self.output_schema = schema.from_blob_list(
            input_record[0],
            [self.get_next_blob_reference('output')])
        return

    # Sum sizes along the concat axis; zero that axis out in each shape so
    # the remaining dims can be compared for compatibility.
    concat_dim = 0
    for shape in shapes:
        concat_dim += shape[axis - 1]
        shape[axis - 1] = 0
        assert shape == shapes[0],\
            "Shapes {0} and {1} are not compatible for Concat".\
            format(shape, shapes[0])
    output_dims = shapes[0]
    output_dims[axis - 1] = concat_dim

    logger.info('Concat Layer output_dims: ' + str(output_dims))
    self.output_schema = schema.Scalar(
        (np.float32, output_dims),
        self.get_next_blob_reference('output'))

    # Propagate a feature -> index map (when available) so downstream
    # consumers can locate each original feature inside the concat output.
    record_to_concat = input_record.fields.values()
    concated_feature_to_index = get_concatenated_feature_to_index(
        record_to_concat)
    if concated_feature_to_index:
        metadata = schema.Metadata(feature_specs=schema.FeatureSpec(
            feature_to_index=concated_feature_to_index))
        self.output_schema.set_metadata(metadata)
def testSetRequestOnly(self):
    """set_request_only must leave pre-existing metadata fields intact."""
    limit, expected, ids = 100000000, 99, [1, 100, 1001]
    record = schema.Scalar(np.int64)
    schema.attach_metadata_to_scalars(
        record,
        schema.Metadata(
            categorical_limit=limit,
            expected_value=expected,
            feature_specs=schema.FeatureSpec(feature_ids=ids)))

    set_request_only(record)

    self.assertEqual(record.metadata.categorical_limit, limit)
    self.assertEqual(record.metadata.expected_value, expected)
    self.assertEqual(record.metadata.feature_specs.feature_ids, ids)
def __init__(self, model, input_record, name='merged'):
    """Merges several IdList inputs into one IdList output whose
    categorical limit is the maximum limit across the inputs.
    """
    super(MergeIdLists, self).__init__(model, name, input_record)
    assert all(schema.equal_schemas(x, IdList) for x in input_record), \
        "Inputs to MergeIdLists should all be IdLists."
    assert all(record.items.metadata is not None
               for record in self.input_record), \
        "Features without metadata are not supported"

    # The merged id space must cover every input's id space.
    limits = [get_categorical_limit(record) for record in self.input_record]
    merge_dim = max(limits)
    assert merge_dim is not None, "Unbounded features are not supported"

    merged_items = schema.Scalar(
        np.int64,
        blob=model.net.NextBlob(name),
        metadata=schema.Metadata(categorical_limit=merge_dim))
    self.output_schema = schema.NewRecord(
        model.net, schema.List(merged_items))
def testMergeIdListsLayer(self, num_inputs, batch_size):
    """Feeds random IdLists and checks that the merged output is an IdList."""
    # Each input contributes a (lengths, values) pair, in that order.
    inputs = []
    for _ in range(num_inputs):
        lengths = np.random.randint(5, size=batch_size).astype(np.int32)
        values = np.random.randint(
            1, 10, size=lengths.sum()).astype(np.int64)
        inputs.extend([lengths, values])

    input_schema = schema.Tuple(*[
        schema.List(
            schema.Scalar(
                dtype=np.int64,
                metadata=schema.Metadata(categorical_limit=20)))
        for _ in range(num_inputs)
    ])

    input_record = schema.NewRecord(self.model.net, input_schema)
    schema.FeedRecord(input_record, inputs)
    output_schema = self.model.MergeIdLists(input_record)
    assert schema.equal_schemas(
        output_schema, IdList, check_field_names=False)
def testSparseLookup(self):
    """SparseLookup over an IdList feature should produce a single
    SparseLengthsSum op, present in both the train and predict nets,
    reading the embedding table plus the feature's items and lengths.
    """
    record = schema.NewRecord(self.model.net, schema.Struct(
        ('sparse', schema.Struct(
            ('sparse_feature_0', schema.List(
                schema.Scalar(np.int64,
                              metadata=schema.Metadata(
                                  categorical_limit=1000)))),
        )),
    ))
    embedding_dim = 64
    embedding_after_pooling = self.model.SparseLookup(
        record.sparse.sparse_feature_0, [embedding_dim], 'Sum')
    self.model.output_schema = embedding_after_pooling
    # Sum-pooling collapses the list: one embedding vector per example.
    self.assertEqual(
        schema.Scalar((np.float32, (embedding_dim, ))),
        embedding_after_pooling
    )

    train_init_net, train_net = self.get_training_nets()

    # Expect two parameter-init ops in the init net; the first
    # (UniformFill) creates the embedding table used by the lookup below.
    # NOTE(review): the role of the ConstantFill isn't visible from this
    # test alone — confirm against the SparseLookup layer implementation.
    init_ops = self.assertNetContainOps(
        train_init_net,
        [
            OpSpec("UniformFill", None, None),
            OpSpec("ConstantFill", None, None),
        ]
    )
    sparse_lookup_op_spec = OpSpec(
        'SparseLengthsSum',
        [
            init_ops[0].output[0],
            record.sparse.sparse_feature_0.items(),
            record.sparse.sparse_feature_0.lengths(),
        ],
        [embedding_after_pooling()]
    )
    self.assertNetContainOps(train_net, [sparse_lookup_op_spec])

    predict_net = self.get_predict_net()
    self.assertNetContainOps(predict_net, [sparse_lookup_op_spec])
def __init__(self, model, input_record, input_specs,
             name='sparse_to_dense', **kwargs):
    """Densifies sparse input features into per-field output blobs.

    `input_specs` follows the format of FeatureSpec from schema. To be more
    precise it's a namedtuple that should have:
        'feature_type', 'feature_names', 'feature_ids'

    The output schema is a Struct with one entry per (field, spec) pair:
    FLOAT fields become a dense float scalar; ID_LIST / ID_SCORE_LIST
    fields become (ranges, values...) structs where `ranges` indexes into
    the shared values blobs of the input record.
    """
    super(SparseToDense, self).__init__(model, name, input_record, **kwargs)

    self.input_specs = input_specs

    outputs = []
    for field, feature_specs in self.input_specs:
        # Names and ids must describe the same features, 1:1.
        assert len(feature_specs.feature_names) ==\
            len(feature_specs.feature_ids)
        if feature_specs.feature_type == 'FLOAT':
            # One dense float per feature id.
            outputs.append(
                (field, schema.Scalar(
                    (np.float32, (len(feature_specs.feature_ids), )),
                    model.net.NextScopedBlob(name + '_' + field + '_output'))))
        elif feature_specs.feature_type == 'ID_LIST':
            # `ranges` holds a (start, length) pair per feature id; values
            # stay in the input record's items blob.
            outputs.append(
                (field, schema.Struct(
                    (
                        'ranges',
                        schema.Scalar(
                            (np.int32, (len(feature_specs.feature_ids), 2)),
                            model.net.NextScopedBlob(
                                name + '_' + field + '_ranges')),
                    ),
                    ('values', input_record[field].values.items),
                )))
        elif feature_specs.feature_type == 'ID_SCORE_LIST':
            # Same ranges layout; ids and scores reference the input
            # record's key/value blobs.
            outputs.append(
                (field, schema.Struct(
                    (
                        'ranges',
                        schema.Scalar(
                            (np.int32, (len(feature_specs.feature_ids), 2)),
                            model.net.NextScopedBlob(
                                name + '_' + field + '_ranges')),
                    ),
                    ('ids', input_record[field].values.keys),
                    ('scores', input_record[field].values.values),
                )))
        else:
            raise TypeError("Unsupported input type: {0}".format(
                feature_specs.feature_type))

    # TODO(amalevich): This schema is producing ranges. And thus anything
    # consuming it should support ranges as well. It might be confusing if
    # we don't add better support for ranges / have it as a first layer.
    self.output_schema = schema.Struct(*outputs)

    # TODO(amalevich): Consider moving this data into schema instead.
    # Structs don't support attaching metadata to them and cloning would
    # break things badly, but this is the most elegant way to pass this
    # info around. Should we change it, or would it be too much work and
    # not worth it?
    for field, feature_specs in input_specs:
        schema.attach_metadata_to_scalars(
            self.output_schema[field],
            schema.Metadata(feature_specs=feature_specs))

    # Global zero constants used as padding defaults.
    self.zero = model.global_constants['ZERO']
    self.zero_range = model.global_constants['ZERO_RANGE']
def __init__(self, model, input_record, inner_shape, reducer,
             weight_init=None, weight_optim=None,
             name='sparse_lookup', **kwargs):
    """Embedding lookup + pooling over a sparse id-list feature.

    Args:
        model: model this layer is attached to.
        input_record: record whose `items` carry metadata with a
            categorical_limit; that limit is the embedding table's row
            count.
        inner_shape: int or list/tuple — per-row embedding shape.
        reducer: pooling reducer; must be in self._supported_reducers.
        weight_init: optional (op_name, kwargs) initializer; defaults to
            UniformFill in [-scale, scale] with scale = sqrt(1/input_dim).
        weight_optim: optimizer applied to the embedding (and positional)
            weights.
    """
    super(SparseLookup, self).__init__(model, name, input_record, **kwargs)

    # Normalize inner_shape to a sequence.
    if isinstance(inner_shape, int):
        inner_shape = [inner_shape]
    assert isinstance(inner_shape, list) or isinstance(inner_shape, tuple),\
        "Unexpected type for inner_shape, expected list or tuple, got {0}".\
        format(type(inner_shape))

    # TODO Add some asserts about input type
    assert reducer in self._supported_reducers, "Unsupported reducer: {}".\
        format(reducer)
    self.reducer = reducer

    assert input_record.items.metadata is not None,\
        "Features without metadata are not supported"
    input_dim = input_record.items.metadata.categorical_limit
    assert input_dim is not None, "Unbounded features are not supported"

    self.output_schema = schema.Scalar(
        (np.float32, inner_shape),
        model.net.NextScopedBlob(name + '_output'),
    )

    if self.request_only:
        # Mark all output scalars request-only, dropping limit/expected
        # value since the pooled output is no longer categorical.
        schema.attach_metadata_to_scalars(
            self.output_schema,
            schema.Metadata(
                categorical_limit=None,
                expected_value=None,
                feature_specs=schema.FeatureSpec(
                    feature_is_request_only=True)))

    scale = math.sqrt(1.0 / input_dim)
    # NOTE(review): `[input_dim] + inner_shape` assumes inner_shape is a
    # list here; a tuple would raise TypeError — confirm callers.
    self.shape = [input_dim] + inner_shape
    self.weight_init = weight_init if weight_init else ('UniformFill', {
        'min': -scale,
        'max': scale
    })

    # Embedding table parameter: one row per category.
    self.w = model.net.NextScopedBlob(name + "_w")
    self.params.append(
        LayerParameter(parameter=self.w,
                       initializer=core.CreateOperator(
                           self.weight_init[0],
                           [],
                           self.w,
                           shape=self.shape,
                           **self.weight_init[1]),
                       optimizer=weight_optim))

    if reducer == 'PositionWeighted':
        # Per-position weights, initialized to 1.0 (identity weighting).
        self.pos_w = model.net.NextScopedBlob(name + "_pos_w")
        self.params.append(
            LayerParameter(parameter=self.pos_w,
                           initializer=core.CreateOperator('ConstantFill',
                                                           [],
                                                           self.pos_w,
                                                           shape=[
                                                               input_dim,
                                                           ],
                                                           value=1.0),
                           optimizer=weight_optim))
def __init__(self, model, input_record, seed=0, modulo=None,
             use_hashing=True, use_divide_mod=False, divisor=None,
             name='sparse_feature_hash', **kwargs):
    """Maps sparse feature ids into a bounded id space.

    Exactly one reduction strategy may be active: hashing
    (`use_hashing`) or integer division followed by modulo
    (`use_divide_mod`, which requires `divisor`). The output is a fresh
    IdList/IdScoreList record whose id field's categorical limit is
    `self.modulo`.
    """
    super(SparseFeatureHash, self).__init__(model, name, input_record, **kwargs)

    # The two reduction modes are mutually exclusive.
    assert use_hashing + use_divide_mod < 2, \
        "use_hashing and use_divide_mod cannot be set true at the same time."

    if use_divide_mod:
        assert divisor >= 1, 'Unexpected divisor: {}'.format(divisor)
        # Divisor is stored as a non-trainable int64 parameter blob.
        self.divisor = self.create_param(
            param_name='divisor',
            shape=[1],
            initializer=('GivenTensorInt64Fill', {
                'values': np.array([divisor])
            }),
            optimizer=model.NoOptim)

    self.seed = seed
    self.use_hashing = use_hashing
    self.use_divide_mod = use_divide_mod

    if schema.equal_schemas(input_record, IdList):
        # Hash-space size: explicit modulo wins, else derived from metadata.
        self.modulo = modulo or self.extract_hash_size(
            input_record.items.metadata)
        metadata = schema.Metadata(
            categorical_limit=self.modulo,
            feature_specs=input_record.items.metadata.feature_specs,
            expected_value=input_record.items.metadata.expected_value)
        with core.NameScope(name):
            self.output_schema = schema.NewRecord(model.net, IdList)
        self.output_schema.items.set_metadata(metadata)
    elif schema.equal_schemas(input_record, IdScoreList):
        self.modulo = modulo or self.extract_hash_size(
            input_record.keys.metadata)
        metadata = schema.Metadata(
            categorical_limit=self.modulo,
            feature_specs=input_record.keys.metadata.feature_specs,
            expected_value=input_record.keys.metadata.expected_value)
        with core.NameScope(name):
            self.output_schema = schema.NewRecord(model.net, IdScoreList)
        self.output_schema.keys.set_metadata(metadata)
    else:
        assert False, "Input type must be one of (IdList, IdScoreList)"

    assert self.modulo >= 1, 'Unexpected modulo: {}'.format(self.modulo)

    # Lengths pass through unchanged, so their metadata does too.
    if input_record.lengths.metadata:
        self.output_schema.lengths.set_metadata(
            input_record.lengths.metadata)

    # operators in this layer do not have CUDA implementation yet.
    # In addition, since the sparse feature keys that we are hashing are
    # typically on CPU originally, it makes sense to have this layer on CPU.
    self.tags.update([Tags.CPU_ONLY])
def __init__(self, model, input_record, input_specs,
             name="feature_sparse_to_dense", default_dense_value=None,
             **kwargs):
    """Densifies sparse input features into per-field output blobs.

    `input_specs` follows the format of FeatureSpec from schema. To be more
    precise it's a namedtuple that should have:
        'feature_type', 'feature_names', 'feature_ids'

    `default_dense_value` must be 0.0 or float("NaN") (None defaults to
    0.0); it selects which global constant pads missing dense values.
    """
    super(FeatureSparseToDense, self).__init__(model, name, input_record,
                                               **kwargs)
    if default_dense_value is None:
        default_dense_value = 0.0
    default_dense_value = float(default_dense_value)
    assert (np.isnan(default_dense_value) or
            default_dense_value == 0.0), \
        "default_dense_value can only be 0.0 or NaN"

    self.input_specs = input_specs
    # Padding constants shared across the model.
    self.default_float_value = (model.global_constants["NAN"]
                                if np.isnan(default_dense_value) else
                                model.global_constants["ZERO"])
    self.zero_range = model.global_constants["ZERO_RANGE"]

    outputs = []
    for field, feature_specs in self.input_specs:
        # Names and ids must describe the same features, 1:1.
        assert len(feature_specs.feature_names) == len(
            feature_specs.feature_ids)
        if feature_specs.feature_type == "FLOAT":
            # One dense float per feature id.
            outputs.append((
                field,
                schema.Scalar(
                    (np.float32, (len(feature_specs.feature_ids), )),
                    self.get_next_blob_reference(field + "_output"),
                ),
            ))
        elif feature_specs.feature_type == "ID_LIST":
            # `ranges` holds a (start, length) pair per feature id into
            # the shared values blob.
            outputs.append((
                field,
                schema.Struct(
                    (
                        "ranges",
                        schema.Scalar(
                            (np.int32,
                             (len(feature_specs.feature_ids), 2)),
                            self.get_next_blob_reference(field + "_ranges"),
                        ),
                    ),
                    (
                        "values",
                        schema.Scalar(
                            np.int64,
                            self.get_next_blob_reference(field + "_values"),
                        ),
                    ),
                ),
            ))
        elif feature_specs.feature_type == "ID_SCORE_LIST":
            outputs.append((
                field,
                schema.Struct(
                    (
                        "ranges",
                        schema.Scalar(
                            (np.int32,
                             (len(feature_specs.feature_ids), 2)),
                            self.get_next_blob_reference(field + "_ranges"),
                        ),
                    ),
                    (
                        "ids",
                        schema.Scalar(
                            np.int64,
                            self.get_next_blob_reference(field + "_ids"),
                        ),
                    ),
                    (
                        "scores",
                        schema.Scalar(
                            np.float32,
                            self.get_next_blob_reference(field + "_scores"),
                        ),
                    ),
                ),
            ))
        elif feature_specs.feature_type == "EMBEDDING":
            # We don't know dimensions of embeddings in input data.
            # Even though they should match dimensions from feature config,
            # we keep ranges blob to check input data later.
            outputs.append((
                field,
                schema.Struct(
                    (
                        "ranges",
                        schema.Scalar(
                            (np.int32,
                             (len(feature_specs.feature_ids), 2)),
                            self.get_next_blob_reference(field + "_ranges"),
                        ),
                    ),
                    (
                        "values",
                        schema.Scalar(
                            np.float32,
                            self.get_next_blob_reference(field + "_values"),
                        ),
                    ),
                ),
            ))
        elif feature_specs.feature_type == "GENERIC_FEATURE":
            # We don't know dimensions of embeddings in input data.
            # Even though they should match dimensions from feature config,
            # we keep ranges blob to check input data later.
            # Currently this schema with ranges and values is only for
            # generic type enum 1. If new types are implemented, we need to
            # modify the ParseGeneric operator, and this part accordingly
            outputs.append((
                field,
                schema.Struct(
                    (
                        "ranges",
                        schema.Scalar(
                            (np.int32,
                             (len(feature_specs.feature_ids), 2)),
                            self.get_next_blob_reference(field + "_ranges"),
                        ),
                    ),
                    (
                        "values",
                        schema.Scalar(
                            np.float32,
                            self.get_next_blob_reference(field + "_values"),
                        ),
                    ),
                ),
            ))
        else:
            raise TypeError("Unsupported input type: {0}".format(
                feature_specs.feature_type))

    # TODO(amalevich): This schema is producing ranges. And thus anything
    # consuming it should support ranges as well. It might be confusing if
    # we don't add better support for ranges / have it as a first layer.
    self.output_schema = schema.Struct(*outputs)

    # TODO(amalevich): Consider moving this data into schema instead.
    # Structs don't support attaching metadata to them and cloning would
    # break things badly, but this is the most elegant way to pass this
    # info around. Should we change it, or would it be too much work and
    # not worth it?
    for field, feature_specs in input_specs:
        schema.attach_metadata_to_scalars(
            self.output_schema[field],
            schema.Metadata(feature_specs=feature_specs))