def add_ops(self, net):
    net.Copy(
        self.input_record.lengths(),
        self.output_schema.lengths()
    )
    if schema.equal_schemas(self.output_schema, IdList):
        input_blob = self.input_record.items()
        output_blob = self.output_schema.items()
    elif schema.equal_schemas(self.output_schema, IdScoreList):
        input_blob = self.input_record.keys()
        output_blob = self.output_schema.keys()
        net.Copy(
            self.input_record.values(),
            self.output_schema.values()
        )
    else:
        raise NotImplementedError()

    if self.use_hashing:
        net.IndexHash(
            input_blob, output_blob, seed=self.seed, modulo=self.modulo)
    else:
        net.Mod(
            input_blob, output_blob, divisor=self.modulo,
            sign_follow_divisor=True)
def add_ops(self, net):
    net.Copy(self.input_record.lengths(), self.output_schema.lengths())
    if schema.equal_schemas(self.output_schema, IdList):
        input_blob = self.input_record.items()
        output_blob = self.output_schema.items()
    elif schema.equal_schemas(self.output_schema, IdScoreList):
        input_blob = self.input_record.keys()
        output_blob = self.output_schema.keys()
        net.Copy(self.input_record.values(), self.output_schema.values())
    else:
        raise NotImplementedError()

    if self.use_hashing:
        net.IndexHash(
            input_blob, output_blob, seed=self.seed, modulo=self.modulo)
    else:
        if self.use_divide_mod:
            quotient = net.Div(
                [input_blob, self.divisor], [net.NextScopedBlob('quotient')])
            net.Mod(
                quotient, output_blob, divisor=self.modulo,
                sign_follow_divisor=True)
        else:
            net.Mod(
                input_blob, output_blob, divisor=self.modulo,
                sign_follow_divisor=True)
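# Illustrative sketch (not part of the layer): the integer arithmetic the
# non-hashing branches above perform, written in plain numpy. The values of
# `ids`, `divisor`, and `modulo` are made up for the demo; the net's Div and
# Mod operators apply the same arithmetic elementwise to the id blob.
import numpy as np

ids = np.array([7, 42, 1001], dtype=np.int64)
divisor, modulo = 10, 8
bucketed = (ids // divisor) % modulo   # the use_divide_mod=True path
remapped = ids % modulo                # the plain Mod path
# As the flag name suggests, `sign_follow_divisor=True` gives the remainder
# the divisor's sign (matching Python's `%`), so negative ids still land in
# [0, modulo) for a positive modulo.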
def add_ops(self, net):
    if schema.equal_schemas(self.output_schema, IdList):
        input_blobs = self.input_record.items.field_blobs()
        output_blobs = self.output_schema.items.field_blobs()
        net.Alias(
            self.input_record.lengths.field_blobs(),
            self.lengths_blob.field_blobs()
        )
    elif schema.equal_schemas(self.output_schema, IdScoreList):
        input_blobs = self.input_record.keys.field_blobs()
        output_blobs = self.output_schema.keys.field_blobs()
        net.Alias(
            self.input_record.values.field_blobs(),
            self.values_blob.field_blobs()
        )
        net.Alias(
            self.input_record.lengths.field_blobs(),
            self.lengths_blob.field_blobs()
        )
    else:
        raise NotImplementedError()

    net.IndexHash(
        input_blobs, output_blobs, seed=self.seed, modulo=self.modulo)
def add_ops(self, net):
    if schema.equal_schemas(self.input_record, IdList):
        if self.reducer in ['Sum', 'Mean']:
            net.__getattr__('SparseLengths' + self.reducer)(
                [
                    self.w,
                    self.input_record.items(),
                    self.input_record.lengths()
                ],
                self.output_schema.field_blobs(),
                engine='fp16')
        elif self.reducer == 'Sqrt':
            sqrt_weight = net.LengthsToWeights(
                [self.input_record.lengths()],
                [self.input_record.lengths() + '_sqrt'],
                power=0.5)
            net.SparseLengthsWeightedSum(
                [
                    self.w,
                    sqrt_weight,
                    self.input_record.items(),
                    self.input_record.lengths()
                ],
                self.output_schema.field_blobs(),
                engine='fp16')
        else:
            table_rows = net.Gather([self.w, self.input_record.items()])
            segment_ids = net.LengthsToSegmentIds(
                self.input_record.lengths(),
                self.input_record.lengths() + '_sid')
            net.__getattr__('SortedSegmentRange' + self.reducer)(
                [table_rows, segment_ids],
                self.output_schema.field_blobs(),
                engine='fp16')
    elif schema.equal_schemas(self.input_record, IdScoreList,
                              check_field_types=False):
        if self.reducer in ['Sum', 'Mean']:
            net.__getattr__('SparseLengthsWeighted' + self.reducer)(
                [
                    self.w,
                    self.input_record.values(),
                    self.input_record.keys(),
                    self.input_record.lengths()
                ],
                self.output_schema.field_blobs(),
                engine='fp16')
        elif self.reducer == 'PositionWeighted':
            net.SparseLengthsWeightedSum(
                [
                    self.w,
                    self.external_weights,
                    self.input_record.keys(),
                    self.input_record.lengths()
                ],
                self.output_schema.field_blobs(),
                grad_on_weights=1,
                engine='fp16')
        else:
            # Raising a plain string is invalid Python; raise an exception.
            raise ValueError(
                "Only Sum and Mean are supported for IdScoreList input. "
                "Trying to create with {}".format(self.reducer))
    else:
        raise ValueError(
            "Unsupported input type {0}".format(self.input_record))
def add_ops(self, net):
    if schema.equal_schemas(self.input_record, IdList):
        if self.reducer == 'Sum':
            net.SparseLengthsSum(
                [
                    self.w,
                    self.input_record.items(),
                    self.input_record.lengths()
                ],
                self.output_schema.field_blobs(),
                engine='fp16'
            )
        elif self.reducer == 'PositionWeighted':
            inc_seq = net.LengthsRangeFill(
                [self.input_record.lengths()],
                self.input_record.lengths() + '_seq'
            )
            gather_pos_w = net.Gather(
                [self.pos_w, inc_seq], self.pos_w + '_gather')
            net.SparseLengthsWeightedSum(
                [
                    self.w,
                    gather_pos_w,
                    self.input_record.items(),
                    self.input_record.lengths()
                ],
                self.output_schema.field_blobs(),
                grad_on_weights=1,
                engine='fp16'
            )
        else:
            # An IdList record has `items`, not `keys`; gather by items.
            table_rows = net.Gather([self.w, self.input_record.items()])
            segment_ids = net.LengthsToSegmentIds(
                self.input_record.lengths())
            net.__getattr__('SortedSegmentRange' + self.reducer)(
                [table_rows, segment_ids],
                self.output_schema.field_blobs(),
                engine='fp16'
            )
    elif schema.equal_schemas(self.input_record, IdScoreList):
        if self.reducer == 'Sum':
            net.SparseLengthsWeightedSum(
                [
                    self.w,
                    self.input_record.values(),
                    self.input_record.keys(),
                    self.input_record.lengths()
                ],
                self.output_schema.field_blobs(),
                engine='fp16'
            )
        else:
            raise ValueError(
                "Only Sum is supported for IdScoreList input. "
                "Trying to create with {}".format(self.reducer))
    else:
        raise ValueError(
            "Unsupported input type {0}".format(self.input_record))
def add_ops(self, net):
    if schema.equal_schemas(self.input_record, IdList):
        self._add_ops_id_list(net)
    elif schema.equal_schemas(self.input_record, IdScoreList,
                              check_field_types=False):
        self._add_ops_id_score_list(net)
    else:
        raise ValueError(
            "Unsupported input type {0}".format(self.input_record))
def get_key(record):
    if schema.equal_schemas(record, IdList):
        key = 'values'
    elif schema.equal_schemas(record, IdScoreList, check_field_types=False):
        key = 'values:keys'
    else:
        raise NotImplementedError('Not implemented for {}'.format(record))
    assert record[key].metadata is not None, (
        "Blob {} doesn't have metadata".format(str(record[key]())))
    return record[key]
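# Illustrative usage of the colon path above (hypothetical record; `model`
# is an assumed LayerModelHelper instance, not defined in these snippets):
# caffe2 schema records accept colon-separated paths in __getitem__, so for
# an IdScoreList record, 'values:keys' addresses the `keys` field nested
# under the id list's `values` field.
id_score_record = schema.NewRecord(model.net, IdScoreList)
keys_field = id_score_record['values:keys']  # the same field get_key returns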
def get_categorical_limit(record):
    if schema.equal_schemas(record, IdList):
        key = 'items'
    elif schema.equal_schemas(record, IdScoreList, check_field_types=False):
        key = 'keys'
    else:
        raise NotImplementedError()
    assert record[key].metadata is not None, (
        "Blob {} doesn't have metadata".format(str(record[key]())))
    return record[key].metadata.categorical_limit
def __init__(self, model, input_record, inner_shape, reducer,
             weight_init=None, weight_optim=None, name='sparse_lookup',
             **kwargs):
    super(SparseLookup, self).__init__(model, name, input_record, **kwargs)

    # TODO Add some asserts about input type
    if isinstance(inner_shape, int):
        inner_shape = [inner_shape]
    assert isinstance(inner_shape, list) or isinstance(inner_shape, tuple),\
        "Unexpected type for inner_shape, expected list or tuple, got {0}".\
        format(type(inner_shape))

    if reducer == "PositionWeighted":
        self.external_weights = input_record.values()
    self.reducer = reducer

    input_dim = get_categorical_limit(input_record)
    assert input_dim is not None, "Unbounded features are not supported"

    scale = math.sqrt(1.0 / input_dim)
    self.shape = [input_dim] + inner_shape
    self.weight_init = weight_init if weight_init else (
        'UniformFill', {'min': -scale, 'max': scale})

    if schema.equal_schemas(self.input_record, IdList):
        sparse_key = self.input_record.items()
    elif schema.equal_schemas(
            self.input_record, IdScoreList, check_field_types=False):
        sparse_key = self.input_record.keys()
    else:
        raise NotImplementedError()

    if self.input_record.lengths.metadata:
        avg_length = self.input_record.lengths.metadata.expected_value
    else:
        avg_length = None

    self.w = self.create_param(
        param_name='w',
        shape=self.shape,
        initializer=self.weight_init,
        optimizer=weight_optim,
        ps_param=LayerPsParam(
            sparse_key=sparse_key,
            average_length=avg_length))

    self.output_schema = schema.Scalar(
        (np.float32, inner_shape),
        self.get_next_blob_reference('output'),
    )
def add_ops(self, net):
    if schema.equal_schemas(self.input_record, IdList):
        if self.reducer == 'Sum':
            net.SparseLengthsSum(
                [
                    self.w,
                    self.input_record.items(),
                    self.input_record.lengths()
                ],
                self.output_schema.field_blobs()
            )
        elif self.reducer == 'PositionWeighted':
            inc_seq = net.LengthsRangeFill(
                [self.input_record.lengths()],
                self.input_record.lengths() + '_seq'
            )
            gather_pos_w = net.Gather(
                [self.pos_w, inc_seq], self.pos_w + '_gather')
            net.SparseLengthsWeightedSum(
                [
                    self.w,
                    gather_pos_w,
                    self.input_record.items(),
                    self.input_record.lengths()
                ],
                self.output_schema.field_blobs(),
                grad_on_weights=1
            )
        else:
            # An IdList record has `items`, not `keys`; gather by items.
            table_rows = net.Gather([self.w, self.input_record.items()])
            segment_ids = net.LengthsToSegmentIds(
                self.input_record.lengths())
            net.__getattr__('SortedSegmentRange' + self.reducer)(
                [table_rows, segment_ids],
                self.output_schema.field_blobs()
            )
    elif schema.equal_schemas(self.input_record, IdScoreList):
        if self.reducer == 'Sum':
            net.SparseLengthsWeightedSum(
                [
                    self.w,
                    self.input_record.values(),
                    self.input_record.keys(),
                    self.input_record.lengths()
                ],
                self.output_schema.field_blobs()
            )
        else:
            raise ValueError(
                "Only Sum is supported for IdScoreList input. "
                "Trying to create with {}".format(self.reducer))
    else:
        raise ValueError(
            "Unsupported input type {0}".format(self.input_record))
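# Illustrative sketch (not part of the layer): the pooling SparseLengthsSum
# performs above, written in plain numpy. `w` is the embedding table,
# `items` the flat id list, and `lengths` the per-example segment sizes;
# all values here are made up for the demo.
import numpy as np

w = np.arange(12, dtype=np.float32).reshape(6, 2)   # 6 ids, embedding dim 2
items = np.array([0, 2, 5, 1], dtype=np.int64)
lengths = np.array([3, 1], dtype=np.int32)          # example 0 owns 3 ids

offsets = np.concatenate([[0], np.cumsum(lengths)])
pooled = np.stack([
    w[items[offsets[i]:offsets[i + 1]]].sum(axis=0)
    for i in range(len(lengths))
])
# pooled[i] is the sum of embedding rows for example i's ids;
# SparseLengthsWeightedSum additionally scales each gathered row by a weight.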
def add_ops(self, net):
    if schema.equal_schemas(self.output_schema, IdList):
        input_blobs = self.input_record.items.field_blobs()
        output_blobs = self.output_schema.items.field_blobs()
    elif schema.equal_schemas(self.output_schema, IdScoreList):
        input_blobs = self.input_record.keys.field_blobs()
        output_blobs = self.output_schema.keys.field_blobs()
    else:
        raise NotImplementedError()

    net.IndexHash(
        input_blobs, output_blobs, seed=self.seed, modulo=self.modulo)
def add_ops(self, net):
    cur_scope = get_current_scope()
    version = get_sparse_lookup_predictor_version(**cur_scope.get(
        get_sparse_lookup_predictor_version.__name__, {'version': 'fp32'}))

    if schema.equal_schemas(self.input_record, IdList):
        self._add_ops_id_list(net, version=version)
    elif schema.equal_schemas(self.input_record, IdScoreList,
                              check_field_types=False):
        self._add_ops_id_score_list(net, version=version)
    else:
        raise ValueError(
            "Unsupported input type {0}".format(self.input_record))
def __init__(self, model, input_record, seed=0, modulo=None,
             use_hashing=True, name='sparse_feature_hash', **kwargs):
    super(SparseFeatureHash, self).__init__(model, name, input_record,
                                            **kwargs)

    self.seed = seed
    self.use_hashing = use_hashing
    if schema.equal_schemas(input_record, IdList):
        self.modulo = modulo or self.extract_hash_size(
            input_record.items.metadata)
        metadata = schema.Metadata(
            categorical_limit=self.modulo,
            feature_specs=input_record.items.metadata.feature_specs,
        )
        hashed_indices = schema.Scalar(
            np.int64,
            self.get_next_blob_reference("hashed_idx"))
        hashed_indices.set_metadata(metadata)
        self.output_schema = schema.List(
            values=hashed_indices,
            lengths_blob=input_record.lengths,
        )
    elif schema.equal_schemas(input_record, IdScoreList):
        self.modulo = modulo or self.extract_hash_size(
            input_record.keys.metadata)
        metadata = schema.Metadata(
            categorical_limit=self.modulo,
            feature_specs=input_record.keys.metadata.feature_specs,
        )
        hashed_indices = schema.Scalar(
            np.int64,
            self.get_next_blob_reference("hashed_idx"))
        hashed_indices.set_metadata(metadata)
        self.output_schema = schema.Map(
            keys=hashed_indices,
            values=input_record.values,
            lengths_blob=input_record.lengths,
        )
    else:
        assert False, "Input type must be one of (IdList, IdScoreList)"

    assert self.modulo >= 1, 'Unexpected modulo: {}'.format(self.modulo)

    # operators in this layer do not have CUDA implementation yet.
    # In addition, since the sparse feature keys that we are hashing are
    # typically on CPU originally, it makes sense to have this layer on CPU.
    self.tags.update([Tags.CPU_ONLY])
def __init__(self, model, input_record, seed, name='sparse_feature_hash',
             **kwargs):
    super(SparseFeatureHash, self).__init__(model, name, input_record,
                                            **kwargs)

    self.seed = seed
    self.lengths_blob = schema.Scalar(
        np.int32,
        model.net.NextScopedBlob(name + "_lengths"),
    )

    if schema.equal_schemas(input_record, IdList):
        self.modulo = self.extract_hash_size(input_record.items.metadata)
        metadata = schema.Metadata(
            categorical_limit=self.modulo,
            feature_specs=input_record.items.metadata.feature_specs,
        )
        hashed_indices = schema.Scalar(
            np.int64,
            model.net.NextScopedBlob(name + "_hashed_idx"))
        hashed_indices.set_metadata(metadata)
        self.output_schema = schema.List(
            values=hashed_indices,
            lengths_blob=self.lengths_blob,
        )
    elif schema.equal_schemas(input_record, IdScoreList):
        self.values_blob = schema.Scalar(
            np.float32,
            model.net.NextScopedBlob(name + "_values"),
        )
        self.modulo = self.extract_hash_size(input_record.keys.metadata)
        metadata = schema.Metadata(
            categorical_limit=self.modulo,
            feature_specs=input_record.keys.metadata.feature_specs,
        )
        hashed_indices = schema.Scalar(
            np.int64,
            model.net.NextScopedBlob(name + "_hashed_idx"))
        hashed_indices.set_metadata(metadata)
        self.output_schema = schema.Map(
            keys=hashed_indices,
            values=self.values_blob,
            lengths_blob=self.lengths_blob,
        )
    else:
        assert False, "Input type must be one of (IdList, IdScoreList)"
def __init__(self, model, input_record, name='select_record_by_context',
             check_field_metas=True, use_copy=False,
             default_output_record_field=None, **kwargs):
    super(SelectRecordByContext, self).__init__(model, name, input_record,
                                                **kwargs)

    assert isinstance(input_record, schema.Struct)
    assert len(input_record) > 1

    self.use_copy = use_copy
    self.default_output_record = (
        input_record[default_output_record_field]
        if (default_output_record_field is not None) else None
    )
    ref_record = input_record[0]
    for record in input_record:
        assert schema.equal_schemas(record, ref_record,
                                    check_field_metas=check_field_metas)

    self.output_schema = schema.NewRecord(model.net, ref_record)
def __init__(self, model, input_record, seed=0, modulo=None,
             use_hashing=True, name='sparse_feature_hash', **kwargs):
    super(SparseFeatureHash, self).__init__(model, name, input_record,
                                            **kwargs)

    self.seed = seed
    self.use_hashing = use_hashing
    if schema.equal_schemas(input_record, IdList):
        self.modulo = modulo or self.extract_hash_size(
            input_record.items.metadata)
        metadata = schema.Metadata(
            categorical_limit=self.modulo,
            feature_specs=input_record.items.metadata.feature_specs,
            expected_value=input_record.items.metadata.expected_value)
        with core.NameScope(name):
            self.output_schema = schema.NewRecord(model.net, IdList)
        self.output_schema.items.set_metadata(metadata)
    elif schema.equal_schemas(input_record, IdScoreList):
        self.modulo = modulo or self.extract_hash_size(
            input_record.keys.metadata)
        metadata = schema.Metadata(
            categorical_limit=self.modulo,
            feature_specs=input_record.keys.metadata.feature_specs,
            expected_value=input_record.keys.metadata.expected_value)
        with core.NameScope(name):
            self.output_schema = schema.NewRecord(model.net, IdScoreList)
        self.output_schema.keys.set_metadata(metadata)
    else:
        assert False, "Input type must be one of (IdList, IdScoreList)"

    assert self.modulo >= 1, 'Unexpected modulo: {}'.format(self.modulo)
    if input_record.lengths.metadata:
        self.output_schema.lengths.set_metadata(
            input_record.lengths.metadata)

    # operators in this layer do not have CUDA implementation yet.
    # In addition, since the sparse feature keys that we are hashing are
    # typically on CPU originally, it makes sense to have this layer on CPU.
    self.tags.update([Tags.CPU_ONLY])
def __init__(self, model, input_record, seed, name='sparse_feature_hash',
             **kwargs):
    super(SparseFeatureHash, self).__init__(model, name, input_record,
                                            **kwargs)

    self.seed = seed
    self.lengths_blob = schema.Scalar(
        np.int32,
        self.get_next_blob_reference("lengths"),
    )

    if schema.equal_schemas(input_record, IdList):
        self.modulo = self.extract_hash_size(input_record.items.metadata)
        metadata = schema.Metadata(
            categorical_limit=self.modulo,
            feature_specs=input_record.items.metadata.feature_specs,
        )
        hashed_indices = schema.Scalar(
            np.int64,
            self.get_next_blob_reference("hashed_idx")
        )
        hashed_indices.set_metadata(metadata)
        self.output_schema = schema.List(
            values=hashed_indices,
            lengths_blob=self.lengths_blob,
        )
    elif schema.equal_schemas(input_record, IdScoreList):
        self.values_blob = schema.Scalar(
            np.float32,
            self.get_next_blob_reference("values"),
        )
        self.modulo = self.extract_hash_size(input_record.keys.metadata)
        metadata = schema.Metadata(
            categorical_limit=self.modulo,
            feature_specs=input_record.keys.metadata.feature_specs,
        )
        hashed_indices = schema.Scalar(
            np.int64,
            self.get_next_blob_reference("hashed_idx")
        )
        hashed_indices.set_metadata(metadata)
        self.output_schema = schema.Map(
            keys=hashed_indices,
            values=self.values_blob,
            lengths_blob=self.lengths_blob,
        )
    else:
        assert False, "Input type must be one of (IdList, IdScoreList)"
def add_ops(self, net):
    if schema.equal_schemas(self.output_schema, IdList):
        input_blob = self.input_record.items()
        output_blob = self.output_schema.items()
    elif schema.equal_schemas(self.output_schema, IdScoreList):
        input_blob = self.input_record.keys()
        output_blob = self.output_schema.keys()
    else:
        raise NotImplementedError()

    if self.use_hashing:
        net.IndexHash(
            input_blob, output_blob, seed=self.seed, modulo=self.modulo)
    else:
        net.Mod(input_blob, output_blob, divisor=self.modulo)
def __init__(self, model, input_record, seed=0, modulo=None,
             use_hashing=True, name='sparse_feature_hash', **kwargs):
    super(SparseFeatureHash, self).__init__(model, name, input_record,
                                            **kwargs)

    self.seed = seed
    self.use_hashing = use_hashing
    if schema.equal_schemas(input_record, IdList):
        self.modulo = modulo or self.extract_hash_size(
            input_record.items.metadata)
        metadata = schema.Metadata(
            categorical_limit=self.modulo,
            feature_specs=input_record.items.metadata.feature_specs,
        )
        hashed_indices = schema.Scalar(
            np.int64,
            self.get_next_blob_reference("hashed_idx"))
        hashed_indices.set_metadata(metadata)
        self.output_schema = schema.List(
            values=hashed_indices,
            lengths_blob=input_record.lengths,
        )
    elif schema.equal_schemas(input_record, IdScoreList):
        self.modulo = modulo or self.extract_hash_size(
            input_record.keys.metadata)
        metadata = schema.Metadata(
            categorical_limit=self.modulo,
            feature_specs=input_record.keys.metadata.feature_specs,
        )
        hashed_indices = schema.Scalar(
            np.int64,
            self.get_next_blob_reference("hashed_idx"))
        hashed_indices.set_metadata(metadata)
        self.output_schema = schema.Map(
            keys=hashed_indices,
            values=input_record.values,
            lengths_blob=input_record.lengths,
        )
    else:
        assert False, "Input type must be one of (IdList, IdScoreList)"

    assert self.modulo >= 1, 'Unexpected modulo: {}'.format(self.modulo)
def __init__(self, model, input_record, name='select_record_by_context',
             check_field_metas=True, **kwargs):
    super(SelectRecordByContext, self).__init__(model, name, input_record,
                                                **kwargs)

    assert isinstance(input_record, schema.Struct)
    assert len(input_record) > 1

    ref_record = input_record[0]
    for record in input_record:
        assert schema.equal_schemas(record, ref_record,
                                    check_field_metas=check_field_metas)

    self.output_schema = schema.NewRecord(model.net, ref_record)
def almost_equal_schemas(
    record,
    original_schema,
    check_field_names=True,
    check_field_types=True,
    check_field_metas=False,
):
    if original_schema == IdList:
        return schema.equal_schemas(
            record,
            IdList,
            check_field_names=check_field_names,
            check_field_types=check_field_types,
            check_field_metas=check_field_metas,
        ) or schema.equal_schemas(
            record,
            IdListWithEvicted,
            check_field_names=check_field_names,
            check_field_types=check_field_types,
            check_field_metas=check_field_metas,
        )
    elif original_schema == IdScoreList:
        return schema.equal_schemas(
            record,
            IdScoreList,
            check_field_names=check_field_names,
            check_field_types=check_field_types,
            check_field_metas=check_field_metas,
        ) or schema.equal_schemas(
            record,
            IdScoreListWithEvicted,
            check_field_names=check_field_names,
            check_field_types=check_field_types,
            check_field_metas=check_field_metas,
        )
    else:
        return schema.equal_schemas(record, original_schema)
def __init__(self, model, input_record, dropout_prob_train,
             dropout_prob_eval, dropout_prob_predict, replacement_value,
             name='sparse_dropout', **kwargs):
    super(SparseDropoutWithReplacement, self).__init__(
        model, name, input_record, **kwargs)
    assert schema.equal_schemas(input_record, IdList), "Incorrect input type"

    self.dropout_prob_train = float(dropout_prob_train)
    self.dropout_prob_eval = float(dropout_prob_eval)
    self.dropout_prob_predict = float(dropout_prob_predict)
    self.replacement_value = int(replacement_value)
    assert (self.dropout_prob_train >= 0 and
            self.dropout_prob_train <= 1.0), \
        "Expected 0 <= dropout_prob_train <= 1, but got %s" \
        % self.dropout_prob_train
    assert (self.dropout_prob_eval >= 0 and
            self.dropout_prob_eval <= 1.0), \
        "Expected 0 <= dropout_prob_eval <= 1, but got %s" \
        % dropout_prob_eval
    assert (self.dropout_prob_predict >= 0 and
            self.dropout_prob_predict <= 1.0), \
        "Expected 0 <= dropout_prob_predict <= 1, but got %s" \
        % dropout_prob_predict
    assert (self.dropout_prob_train > 0 or
            self.dropout_prob_eval > 0 or
            self.dropout_prob_predict > 0), \
        "Ratios all set to 0.0 for train, eval and predict"

    self.output_schema = schema.NewRecord(model.net, IdList)
    if input_record.lengths.metadata:
        self.output_schema.lengths.set_metadata(
            input_record.lengths.metadata)
    if input_record.items.metadata:
        self.output_schema.items.set_metadata(input_record.items.metadata)
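# Minimal sketch of the assumed dropout semantics (illustration only; the
# real SparseDropoutWithReplacement operator may differ in details): with
# probability `p`, an example's whole id list is dropped and replaced by the
# single id `replacement_value`, and its length becomes 1.
import numpy as np

def sparse_dropout_with_replacement(items, lengths, p, replacement_value,
                                    rng=np.random):
    out_items, out_lengths = [], []
    offset = 0
    for n in lengths:
        segment = items[offset:offset + n]
        offset += n
        if rng.uniform() < p:
            out_items.append(
                np.array([replacement_value], dtype=items.dtype))
            out_lengths.append(1)
        else:
            out_items.append(segment)
            out_lengths.append(n)
    return np.concatenate(out_items), np.array(out_lengths, dtype=np.int32)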
def __init__(self, model, input_record, name='merged'):
    super(MergeIdLists, self).__init__(model, name, input_record)
    assert all(schema.equal_schemas(x, IdList) for x in input_record), \
        "Inputs to MergeIdLists should all be IdLists."

    assert all(record.items.metadata is not None
               for record in self.input_record), \
        "Features without metadata are not supported"

    merge_dim = max(get_categorical_limit(record)
                    for record in self.input_record)
    assert merge_dim is not None, "Unbounded features are not supported"

    self.output_schema = schema.NewRecord(
        model.net, schema.List(
            schema.Scalar(
                np.int64,
                blob=model.net.NextBlob(name),
                metadata=schema.Metadata(categorical_limit=merge_dim)
            )))
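# Illustrative sketch (assumed semantics, not the operator itself): merging
# two IdLists per example, modeled here as the per-example union of ids. The
# output categorical limit is the max of the inputs' limits because ids keep
# their original values. All values below are made up for the demo.
import numpy as np

lengths_a, items_a = np.array([2, 1]), np.array([3, 7, 1], dtype=np.int64)
lengths_b, items_b = np.array([1, 2]), np.array([9, 4, 4], dtype=np.int64)

off_a = np.concatenate([[0], np.cumsum(lengths_a)])
off_b = np.concatenate([[0], np.cumsum(lengths_b)])
merged = [
    np.unique(np.concatenate([items_a[off_a[i]:off_a[i + 1]],
                              items_b[off_b[i]:off_b[i + 1]]]))
    for i in range(len(lengths_a))
]
merged_lengths = np.array([len(m) for m in merged], dtype=np.int32)
merged_items = np.concatenate(merged)
# merged_lengths == [3, 2]; merged_items == [3, 7, 9, 1, 4]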
def testMergeIdListsLayer(self, num_inputs, batch_size):
    inputs = []
    for _ in range(num_inputs):
        lengths = np.random.randint(5, size=batch_size).astype(np.int32)
        size = lengths.sum()
        values = np.random.randint(1, 10, size=size).astype(np.int64)
        inputs.append(lengths)
        inputs.append(values)
    input_schema = schema.Tuple(*[
        schema.List(
            schema.Scalar(dtype=np.int64,
                          metadata=schema.Metadata(categorical_limit=20)))
        for _ in range(num_inputs)
    ])

    input_record = schema.NewRecord(self.model.net, input_schema)
    schema.FeedRecord(input_record, inputs)
    output_schema = self.model.MergeIdLists(input_record)
    assert schema.equal_schemas(output_schema, IdList,
                                check_field_names=False)
def __init__(self, model, input_record, inner_shape, reducer,
             weight_init=None, weight_optim=None, name='sparse_lookup',
             **kwargs):
    super(SparseLookup, self).__init__(model, name, input_record, **kwargs)

    # TODO Add some asserts about input type
    if isinstance(inner_shape, int):
        inner_shape = [inner_shape]
    assert isinstance(inner_shape, list) or isinstance(inner_shape, tuple),\
        "Unexpected type for inner_shape, expected list or tuple, got {0}".\
        format(type(inner_shape))

    if reducer == "PositionWeighted":
        self.external_weights = input_record.values()
    self.reducer = reducer

    input_dim = get_categorical_limit(input_record)
    assert input_dim is not None, "Unbounded features are not supported"

    scale = math.sqrt(1.0 / input_dim)
    self.shape = [input_dim] + inner_shape
    self.weight_init = weight_init if weight_init else (
        'UniformFill', {'min': -scale, 'max': scale})

    if schema.equal_schemas(self.input_record, IdList):
        sparse_key = self.input_record.items()
    elif schema.equal_schemas(
            self.input_record, IdScoreList, check_field_types=False):
        sparse_key = self.input_record.keys()
    else:
        raise NotImplementedError()

    if self.input_record.lengths.metadata:
        avg_length = self.input_record.lengths.metadata.expected_value
    else:
        avg_length = None

    self.w = self.create_param(
        param_name='w',
        shape=self.shape,
        initializer=self.weight_init,
        optimizer=weight_optim,
        ps_param=LayerPsParam(
            sparse_key=sparse_key,
            average_length=avg_length))

    self.scale_bias_init = ('ConstantFill', {'value': 0.0})
    self.scale_bias = self.create_param(
        param_name='scale_bias',
        shape=[],
        initializer=self.scale_bias_init,
        optimizer=model.NoOptim)

    self.output_schema = schema.Scalar(
        (np.float32, inner_shape),
        self.get_next_blob_reference('output'),
    )
def _is_id_score_list(input_record):
    return schema.equal_schemas(input_record,
                                IdScoreList,
                                check_field_types=False)
def _is_id_list(input_record):
    return schema.equal_schemas(input_record, IdList)
def __init__(self, model, input_record, inner_shape, reducer,
             weight_init=None, weight_optim=None, name='sparse_lookup',
             **kwargs):
    super(SparseLookup, self).__init__(model, name, input_record, **kwargs)

    if isinstance(inner_shape, int):
        inner_shape = [inner_shape]
    assert isinstance(inner_shape, list) or isinstance(inner_shape, tuple),\
        "Unexpected type for inner_shape, expected list or tuple, got {0}".\
        format(type(inner_shape))

    # TODO Add some asserts about input type
    assert reducer in self._supported_reducers, "Unsupported reducer: {}".\
        format(reducer)
    self.reducer = reducer

    input_dim = get_categorical_limit(input_record)
    assert input_dim is not None, "Unbounded features are not supported"

    self.output_schema = schema.Scalar(
        (np.float32, inner_shape),
        model.net.NextScopedBlob(name + '_output'),
    )

    scale = math.sqrt(1.0 / input_dim)
    self.shape = [input_dim] + inner_shape
    self.weight_init = weight_init if weight_init else (
        'UniformFill', {'min': -scale, 'max': scale})

    self.w = model.net.NextScopedBlob(name + "_w")

    if schema.equal_schemas(self.input_record, IdList):
        sparse_key = self.input_record.items()
    elif schema.equal_schemas(self.input_record, IdScoreList):
        sparse_key = self.input_record.keys()
    else:
        raise NotImplementedError()

    if self.input_record.lengths.metadata:
        avg_length = self.input_record.lengths.metadata.expected_value
    else:
        avg_length = None

    self.params.append(
        LayerParameter(
            parameter=self.w,
            initializer=core.CreateOperator(self.weight_init[0],
                                            [],
                                            self.w,
                                            shape=self.shape,
                                            **self.weight_init[1]),
            optimizer=weight_optim,
            ps_param=LayerPsParam(
                sparse_key=sparse_key,
                average_length=avg_length
            )
        ))

    if reducer == 'PositionWeighted':
        self.pos_w = model.net.NextScopedBlob(name + "_pos_w")
        self.params.append(
            LayerParameter(
                parameter=self.pos_w,
                initializer=core.CreateOperator('ConstantFill',
                                                [],
                                                self.pos_w,
                                                shape=[input_dim, ],
                                                value=1.0),
                optimizer=weight_optim
            ))
def testGatherRecord(self):
    indices = np.array([1, 3, 4], dtype=np.int32)
    dense = np.array(list(range(20)), dtype=np.float32).reshape(10, 2)
    lengths = np.array(list(range(10)), dtype=np.int32)
    items = np.array(list(range(lengths.sum())), dtype=np.int64)
    items_lengths = np.array(list(range(lengths.sum())), dtype=np.int32)
    items_items = np.array(list(range(items_lengths.sum())), dtype=np.int64)
    record = self.new_record(schema.Struct(
        ('dense', schema.Scalar(np.float32)),
        ('sparse', schema.Struct(
            ('list', schema.List(np.int64)),
            ('list_of_list', schema.List(schema.List(np.int64))),
        )),
        ('empty_struct', schema.Struct())
    ))
    indices_record = self.new_record(schema.Scalar(np.int32))
    input_record = schema.Struct(
        ('indices', indices_record),
        ('record', record),
    )
    schema.FeedRecord(
        input_record,
        [indices, dense, lengths, items, lengths, items_lengths,
         items_items])
    gathered_record = self.model.GatherRecord(input_record)
    self.assertTrue(schema.equal_schemas(gathered_record, record))

    self.run_train_net_forward_only()
    gathered_dense = workspace.FetchBlob(gathered_record.dense())
    np.testing.assert_array_equal(
        np.concatenate([dense[i:i + 1] for i in indices]), gathered_dense)
    gathered_lengths = workspace.FetchBlob(
        gathered_record.sparse.list.lengths())
    np.testing.assert_array_equal(
        np.concatenate([lengths[i:i + 1] for i in indices]),
        gathered_lengths)
    gathered_items = workspace.FetchBlob(
        gathered_record.sparse.list.items())
    offsets = lengths.cumsum() - lengths
    np.testing.assert_array_equal(
        np.concatenate([
            items[offsets[i]:offsets[i] + lengths[i]] for i in indices
        ]), gathered_items)

    gathered_items_lengths = workspace.FetchBlob(
        gathered_record.sparse.list_of_list.items.lengths())
    np.testing.assert_array_equal(
        np.concatenate([
            items_lengths[offsets[i]:offsets[i] + lengths[i]]
            for i in indices
        ]),
        gathered_items_lengths
    )

    nested_offsets = []
    nested_lengths = []
    nested_offset = 0
    j = 0
    for l in lengths:
        nested_offsets.append(nested_offset)
        nested_length = 0
        for _i in range(l):
            nested_offset += items_lengths[j]
            nested_length += items_lengths[j]
            j += 1
        nested_lengths.append(nested_length)

    gathered_items_items = workspace.FetchBlob(
        gathered_record.sparse.list_of_list.items.items())
    np.testing.assert_array_equal(
        np.concatenate([
            items_items[nested_offsets[i]:
                        nested_offsets[i] + nested_lengths[i]]
            for i in indices
        ]),
        gathered_items_items
    )
def get_fp16_compatible_parameters(self):
    if (self.reducer == 'Sum' and
            schema.equal_schemas(self.input_record, IdList)):
        return [self.w]
    return []
def __init__(self, model, input_record, seed=0, modulo=None,
             use_hashing=True, use_divide_mod=False, divisor=None,
             name='sparse_feature_hash', **kwargs):
    super(SparseFeatureHash, self).__init__(model, name, input_record,
                                            **kwargs)

    assert use_hashing + use_divide_mod < 2, \
        "use_hashing and use_divide_mod cannot both be true at the same time."

    if use_divide_mod:
        assert divisor >= 1, 'Unexpected divisor: {}'.format(divisor)

        self.divisor = self.create_param(
            param_name='divisor',
            shape=[1],
            initializer=('GivenTensorInt64Fill', {
                'values': np.array([divisor])
            }),
            optimizer=model.NoOptim)

    self.seed = seed
    self.use_hashing = use_hashing
    self.use_divide_mod = use_divide_mod

    if schema.equal_schemas(input_record, IdList):
        self.modulo = modulo or self.extract_hash_size(
            input_record.items.metadata)
        metadata = schema.Metadata(
            categorical_limit=self.modulo,
            feature_specs=input_record.items.metadata.feature_specs,
            expected_value=input_record.items.metadata.expected_value)
        with core.NameScope(name):
            self.output_schema = schema.NewRecord(model.net, IdList)
        self.output_schema.items.set_metadata(metadata)
    elif schema.equal_schemas(input_record, IdScoreList):
        self.modulo = modulo or self.extract_hash_size(
            input_record.keys.metadata)
        metadata = schema.Metadata(
            categorical_limit=self.modulo,
            feature_specs=input_record.keys.metadata.feature_specs,
            expected_value=input_record.keys.metadata.expected_value)
        with core.NameScope(name):
            self.output_schema = schema.NewRecord(model.net, IdScoreList)
        self.output_schema.keys.set_metadata(metadata)
    else:
        assert False, "Input type must be one of (IdList, IdScoreList)"

    assert self.modulo >= 1, 'Unexpected modulo: {}'.format(self.modulo)
    if input_record.lengths.metadata:
        self.output_schema.lengths.set_metadata(
            input_record.lengths.metadata)

    # operators in this layer do not have CUDA implementation yet.
    # In addition, since the sparse feature keys that we are hashing are
    # typically on CPU originally, it makes sense to have this layer on CPU.
    self.tags.update([Tags.CPU_ONLY])
def __init__(self, model, input_record, inner_shape, reducer,
             weight_init=None, weight_optim=None, name='sparse_lookup',
             regularizer=None, **kwargs):
    super(SparseLookup, self).__init__(model, name, input_record, **kwargs)
    self.sparse_key = get_key(self.input_record)()
    logger.info("Setup the sparse lookup layer for " + self.sparse_key)

    # TODO Add some asserts about input type
    if isinstance(inner_shape, int):
        inner_shape = [inner_shape]
    assert isinstance(inner_shape, list) or isinstance(inner_shape, tuple),\
        "Unexpected type for inner_shape, expected list or tuple, "\
        "got {0} for {1}".format(type(inner_shape), self.sparse_key)

    if reducer == "PositionWeighted":
        assert _is_id_score_list(self.input_record), (
            "PositionWeighted only supports IdScoreList, but got {} for {}; "
            "please use the PositionWeighted layer to convert IdList "
            "to IdScoreList".format(repr(self.input_record),
                                    self.sparse_key))
        self.external_weights = self.input_record.values()

    elif reducer == "RecencyWeighted":
        assert _is_id_score_list(self.input_record), (
            "RecencyWeighted only supports IdScoreList, "
            "while the sparse feature {} is not.".format(self.sparse_key))
        self.external_weights = self.input_record.values()
    self.reducer = reducer

    input_dim = get_categorical_limit(self.input_record)
    assert input_dim > 0, (
        "{} should have categorical limit > 0, but got {}".format(
            self.sparse_key, input_dim))

    self.input_dim = input_dim
    self.shape = [input_dim] + inner_shape

    self.trainer_version = get_trainer_version_based_on_optim(weight_optim)

    default_init_op = self._get_default_init_op()

    self.weight_init = weight_init or default_init_op

    self.evicted_values = None
    if schema.equal_schemas(self.input_record, IdListWithEvicted) or \
            schema.equal_schemas(self.input_record, IdScoreListWithEvicted,
                                 check_field_types=False):
        self.evicted_values = self.input_record._evicted_values

    # If fp16 is used, make sure fp16 init op is used
    if self.trainer_version == "fp16":
        assert self.reducer in self._fp16_compatible_reducers, (
            "Fp16 training is enabled. The reducer specified is not "
            "supported. Got {}. Supported reducers: {}. Right now, in "
            "general, sum, mean, positional pooling are supported. Attention "
            "is not. Please check if there is fp16 trained sparse features "
            "using advanced pooling.".format(
                self.reducer, self._fp16_compatible_reducers))

        # if init op is UniformFill, we replace it directly
        if self.weight_init[0] == "UniformFill":
            self.weight_init = ("Float16UniformFill", self.weight_init[1])
        assert self.weight_init[0] in self._fp16_compatible_init_op_types, (
            "Fp16 training is enabled. Init op for weight parameter must be "
            "fp16 compatible. Got {}. Supported ops: {}".format(
                self.weight_init[0],
                self._fp16_compatible_init_op_types))

        assert regularizer is None, "Regularizer is not compatible with fp16"

    if self.input_record.lengths.metadata:
        avg_length = self.input_record.lengths.metadata.expected_value
    else:
        avg_length = None

    self.w = self.create_param(
        param_name='w',
        shape=self.shape,
        initializer=self.weight_init,
        optimizer=weight_optim,
        ps_param=LayerPsParam(
            sparse_key=self.sparse_key,
            average_length=avg_length),
        regularizer=regularizer)
    if self.evicted_values:
        self.reinit_vec = self.create_param(
            param_name="reinit_vec",
            shape=inner_shape,
            initializer=self.weight_init,
            optimizer=model.NoOptim,
            regularizer=None,
        )

    self.scale_bias_init = ('ConstantFill', {'value': 0.0})
    self.scale_bias = self.create_param(
        param_name='scale_bias',
        shape=[],
        initializer=self.scale_bias_init,
        optimizer=model.NoOptim,
    )
    self.output_schema = schema.Scalar(
        (np.float32, inner_shape),
        self.get_next_blob_reference('output'),
    )