def transform(inputs, num_cols, cat_cols):
    print("Inputs before features transformation: {}".format(inputs.keys()))

    # Pass-through columns
    transformed = inputs.copy()
    feature_columns = {
        colname: tf.feature_column.numeric_column(colname)
        for colname in num_cols
    }

    # Add Euclidean distance
    transformed['euclidean'] = layers.Lambda(euclidean, name='euclidean')([
        inputs['pickuplon'], inputs['pickuplat'],
        inputs['dropofflon'], inputs['dropofflat']
    ])
    feature_columns['euclidean'] = fc.numeric_column('euclidean')

    # Shift 'dayofweek' feature to a value range of 0-6
    transformed['dayofweek'] = transformed['dayofweek'] - 1

    # Create categorical columns (wrapped in indicator columns)
    feature_columns['hourofday'] = fc.indicator_column(
        fc.categorical_column_with_identity('hourofday', 24))
    feature_columns['dayofweek'] = fc.indicator_column(
        fc.categorical_column_with_identity('dayofweek', 7))

    print("Transformed features: {}".format(transformed.keys()))
    print("Feature columns: {}".format(feature_columns.keys()))
    return transformed, feature_columns
def generateFeatureColumn(self):
    for columnName in self.featureList:
        if columnName != "prognosis":
            self.featureColumn.append(
                feature_column.categorical_column_with_identity(
                    key=columnName, num_buckets=2))
        else:
            self.featureColumn.append(
                feature_column.categorical_column_with_identity(
                    key=columnName, num_buckets=41))
def create_user_feature_columns():
    gender = fc.indicator_column(
        fc.categorical_column_with_identity("gender", num_buckets=3, default_value=0))
    age_class = fc.indicator_column(
        fc.categorical_column_with_identity("age_class", num_buckets=7, default_value=0))
    has_baby = fc.indicator_column(
        fc.categorical_column_with_identity("has_baby", num_buckets=2, default_value=0))
    baby_gender = fc.indicator_column(
        fc.categorical_column_with_identity("baby_gender", num_buckets=3, default_value=0))
    baby_age = fc.indicator_column(
        fc.categorical_column_with_identity("baby_age", num_buckets=7, default_value=0))
    grade = fc.indicator_column(
        fc.categorical_column_with_identity("grade", num_buckets=7, default_value=0))
    rfm_type = fc.indicator_column(
        fc.categorical_column_with_identity("bi_rfm_type", num_buckets=12, default_value=0))
    cate1_price_prefer = fc.indicator_column(
        fc.categorical_column_with_identity("cate1_price_prefer", num_buckets=6, default_value=0))
    cate2_price_prefer = fc.indicator_column(
        fc.categorical_column_with_identity("cate2_price_prefer", num_buckets=6, default_value=0))
    cate3_price_prefer = fc.indicator_column(
        fc.categorical_column_with_identity("cate3_price_prefer", num_buckets=6, default_value=0))
    city_id = fc.categorical_column_with_hash_bucket("city", 700)
    city = fc.shared_embedding_columns([city_id], 16)
    cols = [gender, age_class, has_baby, baby_gender, baby_age, grade, rfm_type,
            cate1_price_prefer, cate2_price_prefer, cate3_price_prefer]
    return cols + city
def build_features(statistics):
    pu_location_id = fc.categorical_column_with_identity(key='PULocationID', num_buckets=265)
    do_location_id = fc.categorical_column_with_identity(key='DOLocationID', num_buckets=265)
    day_of_week = fc.categorical_column_with_identity(key='day_of_week', num_buckets=7)
    weekend = fc.categorical_column_with_identity(key='weekend', num_buckets=2)
    speed_buckets = fc.bucketized_column(
        fc.numeric_column('speed'), boundaries=[10, 20, 30, 40, 50, 60, 70])
    distance_buckets = fc.bucketized_column(
        fc.numeric_column('trip_distance'),
        boundaries=[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
    duration_buckets = fc.bucketized_column(
        fc.numeric_column('duration'),
        boundaries=[500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500])
    fare_buckets = fc.bucketized_column(
        fc.numeric_column('fare_amount'),
        boundaries=[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
    passenger_buckets = fc.bucketized_column(
        fc.numeric_column('passenger_count'), boundaries=[1, 3, 5, 7, 9])
    location = fc.crossed_column([pu_location_id, do_location_id],
                                 hash_bucket_size=1000)
    cross_all = fc.crossed_column([
        location, speed_buckets, distance_buckets, duration_buckets,
        fare_buckets, passenger_buckets
    ], hash_bucket_size=1000)
    categorical_columns = [
        fc.embedding_column(pu_location_id, dimension=32),
        fc.embedding_column(do_location_id, dimension=32),
        fc.indicator_column(day_of_week),
        fc.indicator_column(weekend)
    ]
    numeric_columns = [
        custom_numeric_column('passenger_count', statistics),
        custom_numeric_column('trip_distance', statistics),
        custom_numeric_column('fare_amount', statistics),
        custom_numeric_column('extra', statistics),
        custom_numeric_column('mta_tax', statistics),
        custom_numeric_column('tolls_amount', statistics),
        custom_numeric_column('improvement_surcharge', statistics),
        custom_numeric_column('duration', statistics),
        custom_numeric_column('speed', statistics)
    ]
    dnn_feature_columns = numeric_columns + categorical_columns
    linear_feature_columns = [location, cross_all]
    return dnn_feature_columns, linear_feature_columns
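# `custom_numeric_column` is not defined in the snippet above. A minimal sketch
# of what such a helper might look like, assuming `statistics` maps each column
# name to precomputed mean/stddev values (this schema is an assumption, not
# taken from the original source): z-score normalization via normalizer_fn.
def custom_numeric_column(name, statistics):
    mean = statistics[name]['mean']      # hypothetical schema
    stddev = statistics[name]['stddev']  # hypothetical schema
    return fc.numeric_column(name, normalizer_fn=lambda x: (x - mean) / stddev)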
def _prepare_for_crossing(self, key_name, num_bck, boundaries):
    """Prepares features for crossing.

    Whether they're continuous or categorical matters, and whether we have
    the whole dictionary or not.

    Args:
      key_name: A string representing the name of the feature
      num_bck: How many buckets to use when we know the # of distinct values
      boundaries: Range used for boundaries when bucketizing

    Returns:
      key name
    """
    key = None
    if key_name in self.continuous.keys():
        if boundaries is not None:
            # Note that cont[key_name] is a source column
            key = tfc.bucketized_column(self.continuous[key_name], boundaries)
        else:
            # We can count all the values in the dataset. Ex: boolean.
            # Note that key_name is a string
            key = tfc.categorical_column_with_identity(key_name, num_bck)
    elif key_name in self.categorical.keys():
        # It is also possible to use the categorical column instead of the
        # column name, i.e. key = cat[key_name]
        key = key_name
    else:
        key = key_name
    return key
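# Called from inside the same class, the prepared keys can feed a cross.
# A minimal sketch; the feature names, bucket count, and boundaries below are
# hypothetical, not from the original source.
lat_key = self._prepare_for_crossing('pickup_lat', None, [37.0, 39.0, 41.0, 43.0])
weekend_key = self._prepare_for_crossing('is_weekend', 2, None)
crossed = tfc.crossed_column([lat_key, weekend_key], hash_bucket_size=100)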
def tf_inputs_dataframe(self, batch_size=1, buffer_size=1000):
    dataframe = read_csv(
        os.path.join(os.path.dirname(self.json_filename),
                     self.data_description["csv"]))
    labels_name = 'ga_edd'
    y_name = labels_name
    for column_name in dataframe.columns:
        if column_name.startswith('_'):
            dataframe.pop(column_name)
    for header in [
            'fl_1', 'bp_1', 'hc_1', 'ac_1', 'mom_age_edd', 'mom_weight_lb',
            'mom_height_in'
    ]:
        r = max(dataframe[header]) - min(dataframe[header])
        dataframe[header] = (dataframe[header] - min(dataframe[header])) / r
    dataframe = dataframe[(dataframe[y_name] != '.')
                          & (notna(dataframe[y_name]))
                          & (notnull(dataframe[y_name]))].copy()
    dataframe = dataframe.astype({y_name: 'int32'})

    feature_columns = []
    feature_names = []
    num_channels = 0
    for header in [
            'fl_1', 'bp_1', 'hc_1', 'ac_1', 'mom_age_edd', 'mom_weight_lb',
            'mom_height_in'
    ]:
        feature_columns.append(feature_column.numeric_column(header))
        feature_names.append(header)
        num_channels += 1
    num_identity = 2
    for header in [
            'hiv', 'current_smoker', 'former_smoker', 'chronic_htn',
            'preg_induced_htn', 'diabetes', 'gest_diabetes'
    ]:
        col = feature_column.categorical_column_with_identity(header, num_identity)
        col = feature_column.indicator_column(col)
        feature_columns.append(col)
        feature_names.append(header)
        num_channels += num_identity
    self.num_channels = num_channels

    feature_layer = tf.keras.layers.DenseFeatures(feature_columns=feature_columns)
    dataframe = dataframe.copy()
    labels = dataframe.pop(labels_name)
    dataset = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    dataset = dataset.shuffle(buffer_size=buffer_size)
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(lambda x, y: (feature_layer(x), y))
    return dataset
def test_identity_feature_column():
    sample = {'price': [[1], [2], [3], [0]]}
    # price_column = feature_column.numeric_column('price')
    price_column = feature_column.categorical_column_with_identity(
        key='price', num_buckets=4)
    indicator = feature_column.indicator_column(price_column)
    price_column_tensor = feature_column.input_layer(sample, [indicator])
    with tf.Session() as session:
        print(session.run([price_column_tensor]))
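# For reference: the identity column uses each integer directly as a one-hot
# index, so the run above prints (up to float formatting):
# [array([[0., 1., 0., 0.],
#         [0., 0., 1., 0.],
#         [0., 0., 0., 1.],
#         [1., 0., 0., 0.]], dtype=float32)]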
def create_deep_feature_columns():
    phoneOs = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"],
                                                   default_value=0))
    # context feature
    matchScore = fc.numeric_column("matchScore", default_value=0.0)
    popScore = fc.numeric_column("popScore", default_value=0.0)
    brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0, normalizer_fn=truncate)
    cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0, normalizer_fn=truncate)
    catePrefer = fc.numeric_column("catePrefer", default_value=0.0, normalizer_fn=truncate)
    sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0, normalizer_fn=truncate)
    matchType = fc.indicator_column(fc.categorical_column_with_identity("matchType", 9, default_value=0))
    triggerNum = fc.indicator_column(fc.categorical_column_with_identity("triggerNum", 41, default_value=40))
    triggerRank = fc.indicator_column(fc.categorical_column_with_identity("triggerRank", 41, default_value=40))
    sceneType = fc.indicator_column(fc.categorical_column_with_identity("type", 2, default_value=0))
    columns = [matchScore, matchType, triggerNum, triggerRank, sceneType, phoneOs,
               popScore, sellerPrefer, brandPrefer, cate2Prefer, catePrefer]
    print("deep feature columns:", columns)
    return columns
def transform(inputs, NUMERIC_COLS, STRING_COLS, nbuckets):
    # Pass-through columns
    transformed = inputs.copy()
    del transformed['pickup_datetime']

    feature_columns = {
        colname: fc.numeric_column(colname)
        for colname in NUMERIC_COLS
    }

    # Scaling longitude from range [-78, -70] to [0, 1]
    for lon_col in ['pickup_longitude', 'dropoff_longitude']:
        transformed[lon_col] = layers.Lambda(
            lambda x: (x + 78) / 8.0,
            name='scale_{}'.format(lon_col))(inputs[lon_col])

    # Scaling latitude from range [37, 45] to [0, 1]
    for lat_col in ['pickup_latitude', 'dropoff_latitude']:
        transformed[lat_col] = layers.Lambda(
            lambda x: (x - 37) / 8.0,
            name='scale_{}'.format(lat_col))(inputs[lat_col])

    # Adding Euclidean dist (no need to be accurate: NN will calibrate it)
    transformed['euclidean'] = layers.Lambda(euclidean, name='euclidean')([
        inputs['pickup_longitude'], inputs['pickup_latitude'],
        inputs['dropoff_longitude'], inputs['dropoff_latitude']
    ])
    feature_columns['euclidean'] = fc.numeric_column('euclidean')

    # hour of day from timestamp of form '2010-02-08 09:17:00+00:00'
    transformed['hourofday'] = layers.Lambda(
        lambda x: tf.strings.to_number(tf.strings.substr(x, 11, 2),
                                       out_type=tf.dtypes.int32),
        name='hourofday')(inputs['pickup_datetime'])
    feature_columns['hourofday'] = fc.indicator_column(
        fc.categorical_column_with_identity('hourofday', num_buckets=24))

    latbuckets = np.linspace(0, 1, nbuckets).tolist()
    lonbuckets = np.linspace(0, 1, nbuckets).tolist()
    b_plat = fc.bucketized_column(feature_columns['pickup_latitude'], latbuckets)
    b_dlat = fc.bucketized_column(feature_columns['dropoff_latitude'], latbuckets)
    b_plon = fc.bucketized_column(feature_columns['pickup_longitude'], lonbuckets)
    b_dlon = fc.bucketized_column(feature_columns['dropoff_longitude'], lonbuckets)
    ploc = fc.crossed_column([b_plat, b_plon], nbuckets * nbuckets)
    dloc = fc.crossed_column([b_dlat, b_dlon], nbuckets * nbuckets)
    pd_pair = fc.crossed_column([ploc, dloc], nbuckets**4)
    feature_columns['pickup_and_dropoff'] = fc.embedding_column(pd_pair, 100)

    return transformed, feature_columns
def create_feature_columns():
    # user feature
    phoneBrandId = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
    phoneResolutionId = fc.categorical_column_with_hash_bucket("phoneResolution", 500)
    phoneBrand = fc.embedding_column(phoneBrandId, 20)
    phoneResolution = fc.embedding_column(phoneResolutionId, 10)
    phoneOs = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"],
                                                   default_value=0))
    # context feature
    matchScore = fc.numeric_column("matchScore", default_value=0.0)
    popScore = fc.numeric_column("popScore", default_value=0.0)
    brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0, normalizer_fn=truncate)
    cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0, normalizer_fn=truncate)
    catePrefer = fc.numeric_column("catePrefer", default_value=0.0, normalizer_fn=truncate)
    sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0, normalizer_fn=truncate)
    matchType = fc.indicator_column(fc.categorical_column_with_identity("matchType", 9, default_value=0))
    position = fc.indicator_column(fc.categorical_column_with_identity("position", 201, default_value=200))
    triggerNum = fc.indicator_column(fc.categorical_column_with_identity("triggerNum", 51, default_value=50))
    triggerRank = fc.indicator_column(fc.categorical_column_with_identity("triggerRank", 51, default_value=50))
    sceneType = fc.indicator_column(fc.categorical_column_with_identity("type", 2, default_value=0))
    hour = fc.indicator_column(fc.categorical_column_with_identity("hour", 24, default_value=0))

    global my_feature_columns
    my_feature_columns = [matchScore, matchType, position, triggerNum, triggerRank,
                          sceneType, hour, phoneBrand, phoneResolution, phoneOs,
                          popScore, sellerPrefer, brandPrefer, cate2Prefer, catePrefer]
    print("feature columns:", my_feature_columns)
    return my_feature_columns
def categorical_embedding_with_indices(feature_tensor, feature_info, file_io: FileIO):
    """
    Converts input integer tensor into categorical embedding.
    Works by converting the categorical indices in the input feature_tensor,
    represented as integer values, into categorical embeddings based on the
    feature_info.

    Parameters
    ----------
    feature_tensor : Tensor object
        int feature tensor
    feature_info : dict
        Dictionary representing the configuration parameters for the specific
        feature from the FeatureConfig
    file_io : FileIO object
        FileIO handler object for reading and writing

    Returns
    -------
    Tensor object
        categorical embedding for the input feature_tensor

    Notes
    -----
    Args under feature_layer_info:
        num_buckets : int
            Maximum number of categorical values
        default_value : int
            default value to be assigned to indices out of the num_buckets range
        embedding_size : int
            dimension size of the categorical embedding

    String based categorical features should already be converted into numeric indices
    """
    feature_layer_info = feature_info.get("feature_layer_info")
    categorical_fc = feature_column.categorical_column_with_identity(
        CATEGORICAL_VARIABLE,
        num_buckets=feature_layer_info["args"]["num_buckets"],
        default_value=feature_layer_info["args"].get("default_value", None),
    )
    embedding_fc = feature_column.embedding_column(
        categorical_fc,
        dimension=feature_layer_info["args"]["embedding_size"],
        trainable=True,
    )

    embedding = layers.DenseFeatures(
        embedding_fc,
        name="{}_embedding".format(
            feature_info.get("node_name", feature_info["name"])),
    )({CATEGORICAL_VARIABLE: feature_tensor})
    embedding = tf.expand_dims(embedding, axis=1)

    return embedding
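# A minimal usage sketch for the function above. The feature_info layout
# mirrors the docstring; the concrete names and sizes are hypothetical.
feature_info = {
    "name": "category_id",
    "feature_layer_info": {
        "args": {"num_buckets": 1000, "default_value": 0, "embedding_size": 64}
    },
}
ids = tf.constant([[3], [17], [999]], dtype=tf.int64)
emb = categorical_embedding_with_indices(ids, feature_info, file_io=None)
# emb has shape [batch_size, 1, embedding_size]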
def categorical_column(key, vocabulary_size=None, vocabulary_list=None,
                       vocabulary_file=None, num_oov_buckets=0):
    # num_oov_buckets must be passed by keyword below: positionally it would
    # land in the dtype / vocabulary_size slots of the respective functions
    if vocabulary_size:
        return feature_column.categorical_column_with_identity(key, vocabulary_size)
    elif vocabulary_list:
        assert isinstance(vocabulary_list[0], six.string_types), \
            "Vocabulary must be sequence of string"
        return feature_column.categorical_column_with_vocabulary_list(
            key, vocabulary_list, num_oov_buckets=num_oov_buckets)
    elif vocabulary_file:
        return feature_column.categorical_column_with_vocabulary_file(
            key, vocabulary_file, num_oov_buckets=num_oov_buckets)
    else:
        raise ValueError("One of vocabulary_size, vocabulary_list or "
                         "vocabulary_file is required")
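# Usage sketch for the helper above (feature names are illustrative):
city = categorical_column("city_id", vocabulary_size=1000)
color = categorical_column("color", vocabulary_list=["R", "G", "B"], num_oov_buckets=1)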
def test_categorical_identity_column():
    # 1. Input features
    price = {'price': [[3], [1], [2], [0]]}
    # 2. Feature columns (Sparse)
    identity_feature_column = feature_column.categorical_column_with_identity(
        key='price', num_buckets=4)
    # 2. Feature columns (Dense): convert the categorical column to a dense column
    indicator_column = feature_column.indicator_column(identity_feature_column)
    # 3. Feature tensor
    identity_feature_tensor = feature_column.input_layer(price, [indicator_column])
    with tf.Session() as session:
        print(session.run([identity_feature_tensor]))
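# Each integer is used directly as the one-hot index, so this prints:
# [array([[0., 0., 0., 1.],
#         [0., 1., 0., 0.],
#         [0., 0., 1., 0.],
#         [1., 0., 0., 0.]], dtype=float32)]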
def create_feature_columns():
    # user feature
    driver_age_class = fc.embedding_column(
        fc.categorical_column_with_identity("driver_age", num_buckets=7, default_value=0), 32)
    # item feature
    pax_age_class = fc.embedding_column(
        fc.categorical_column_with_identity("pax_age", num_buckets=7, default_value=0), 32)
    pax_des = fc.categorical_column_with_hash_bucket("des_id", 10000)
    pax_des_embed = fc.embedding_column(pax_des, 32)
    # context feature
    pax_price = tf.feature_column.numeric_column('price_id', default_value=0.0)
    pax_price_splits = tf.feature_column.bucketized_column(
        pax_price,
        boundaries=[10 * 100, 20 * 100, 30 * 100, 40 * 100, 50 * 100, 60 * 100,
                    70 * 100, 80 * 100, 90 * 100, 100 * 100, 110 * 100, 120 * 100])
    pax_price_embed = fc.embedding_column(pax_price_splits, 32)

    seq_cols = ['hist_price_id', 'hist_des_id']
    # hist_price_seq_embed = fc.embedding_column(fc.categorical_column_with_vocabulary_file(
    #     key='hist_price_id',
    #     vocabulary_file='./map.txt',
    #     num_oov_buckets=0), 32)
    # hist_des_seq_embed = fc.embedding_column(
    #     fc.categorical_column_with_vocabulary_file(key='hist_des_id',
    #                                                vocabulary_file='./map.txt',
    #                                                default_value=0), dimension=32)
    hist_price_seq_embed = fc.numeric_column(key='hist_price_id', shape=(3,),
                                             default_value=[0.0] * 3, dtype=tf.float32)
    hist_des_seq_embed = fc.numeric_column(key='hist_des_id', shape=(3,),
                                           default_value=[0.0] * 3, dtype=tf.float32)

    global my_feature_columns
    my_feature_columns = [driver_age_class, pax_age_class, pax_des_embed,
                          pax_price_embed, hist_price_seq_embed, hist_des_seq_embed]
    return my_feature_columns
def create_features_columns(self):
    # Vector features
    user_vector = fc.numeric_column(key="user_vector", shape=(128,),
                                    default_value=[0.0] * 128, dtype=tf.float32)
    item_vector = fc.numeric_column(key="item_vector", shape=(128,),
                                    default_value=[0.0] * 128, dtype=tf.float32)

    # Bucketized features
    age = fc.numeric_column(key="age", shape=(1,), default_value=[0], dtype=tf.int64)
    age = fc.bucketized_column(age, boundaries=[0, 10, 20, 30, 40, 50, 60, 70, 80])
    age = fc.embedding_column(age, dimension=32, combiner='mean')

    # Categorical features
    city = fc.categorical_column_with_identity(key="city", num_buckets=1000,
                                               default_value=0)
    city = fc.embedding_column(city, dimension=32, combiner='mean')

    # Hashed features
    device_id = fc.categorical_column_with_hash_bucket(
        key="device_id", hash_bucket_size=1000000, dtype=tf.int64)
    device_id = fc.embedding_column(device_id, dimension=32, combiner='mean')
    item_id = fc.categorical_column_with_hash_bucket(
        key="item_id", hash_bucket_size=10000, dtype=tf.int64)
    item_id = fc.embedding_column(item_id, dimension=32, combiner='mean')

    self.user_columns["user_vector"] = user_vector
    self.user_columns["age"] = age
    self.user_columns["city"] = city
    self.user_columns["device_id"] = device_id
    self.item_columns["item_vector"] = item_vector
    self.item_columns["item_id"] = item_id
    self.feature_spec = tf.feature_column.make_parse_example_spec(
        list(self.user_columns.values()) + list(self.item_columns.values()))
    return self
def create_feature_columns():
    # user feature
    bids = fc.categorical_column_with_hash_bucket("behaviorBids", 10240, dtype=tf.int64)
    c1ids = fc.categorical_column_with_hash_bucket("behaviorC1ids", 100, dtype=tf.int64)
    cids = fc.categorical_column_with_hash_bucket("behaviorCids", 10240, dtype=tf.int64)
    sids = fc.categorical_column_with_hash_bucket("behaviorSids", 10240, dtype=tf.int64)
    pids = fc.categorical_column_with_hash_bucket("behaviorPids", 1000000, dtype=tf.int64)
    bids_weighted = fc.weighted_categorical_column(bids, "bidWeights")
    c1ids_weighted = fc.weighted_categorical_column(c1ids, "c1idWeights")
    cids_weighted = fc.weighted_categorical_column(cids, "cidWeights")
    sids_weighted = fc.weighted_categorical_column(sids, "sidWeights")
    pids_weighted = fc.weighted_categorical_column(pids, "pidWeights")
    # item feature
    pid = fc.categorical_column_with_hash_bucket("productId", 1000000, dtype=tf.int64)
    sid = fc.categorical_column_with_hash_bucket("sellerId", 10240, dtype=tf.int64)
    bid = fc.categorical_column_with_hash_bucket("brandId", 10240, dtype=tf.int64)
    c1id = fc.categorical_column_with_hash_bucket("cate1Id", 100, dtype=tf.int64)
    cid = fc.categorical_column_with_hash_bucket("cateId", 10240, dtype=tf.int64)
    # context feature
    matchScore = fc.numeric_column("matchScore", default_value=0.0)
    popScore = fc.numeric_column("popScore", default_value=0.0)
    brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0)
    cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0)
    catePrefer = fc.numeric_column("catePrefer", default_value=0.0)
    sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0)
    matchType = fc.indicator_column(fc.categorical_column_with_identity("matchType", 9, default_value=0))
    position = fc.indicator_column(fc.categorical_column_with_identity("position", 201, default_value=200))
    triggerNum = fc.indicator_column(fc.categorical_column_with_identity("triggerNum", 51, default_value=50))
    triggerRank = fc.indicator_column(fc.categorical_column_with_identity("triggerRank", 51, default_value=50))
    sceneType = fc.indicator_column(fc.categorical_column_with_identity("type", 2, default_value=0))
    hour = fc.indicator_column(fc.categorical_column_with_identity("hour", 24, default_value=0))
    phoneBrand = fc.indicator_column(fc.categorical_column_with_hash_bucket("phoneBrand", 1000))
    phoneResolution = fc.indicator_column(fc.categorical_column_with_hash_bucket("phoneResolution", 500))
    phoneOs = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"],
                                                   default_value=0))
    tab = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list(
            "tab",
            ["ALL", "TongZhuang", "XieBao", "MuYing", "NvZhuang", "MeiZhuang",
             "JuJia", "MeiShi"],
            default_value=0))

    pid_embed = fc.shared_embedding_columns([pids_weighted, pid], 64, combiner='sum',
                                            shared_embedding_collection_name="pid")
    bid_embed = fc.shared_embedding_columns([bids_weighted, bid], 32, combiner='sum',
                                            shared_embedding_collection_name="bid")
    cid_embed = fc.shared_embedding_columns([cids_weighted, cid], 32, combiner='sum',
                                            shared_embedding_collection_name="cid")
    c1id_embed = fc.shared_embedding_columns([c1ids_weighted, c1id], 10, combiner='sum',
                                             shared_embedding_collection_name="c1id")
    sid_embed = fc.shared_embedding_columns([sids_weighted, sid], 32, combiner='sum',
                                            shared_embedding_collection_name="sid")

    global my_feature_columns
    my_feature_columns = [matchScore, matchType, position, triggerNum, triggerRank,
                          sceneType, hour, phoneBrand, phoneResolution, phoneOs, tab,
                          popScore, sellerPrefer, brandPrefer, cate2Prefer, catePrefer]
    my_feature_columns += pid_embed
    my_feature_columns += sid_embed
    my_feature_columns += bid_embed
    my_feature_columns += cid_embed
    my_feature_columns += c1id_embed
    print("feature columns:", my_feature_columns)
    return my_feature_columns
def transform(self, output_tensors):
    input_tensor_name = self.parameters.get("input_tensor")
    output_tensor_name = self.parameters.get("output_tensor")
    # default_value for an identity column must be an int in [0, num_buckets)
    # or None; the previous string fallback ('-1') would fail at construction
    default_value = self.parameters.get("default_value")
    if default_value is not None:
        default_value = int(default_value)
    if "bucket_size" in self.parameters:
        bucket_size = self.parameters.get("bucket_size")
    else:
        msg = ("parameters error: sparse_column_with_integerized_feature "
               "requires bucket_size")
        logger.error(msg)
        raise ParametersError(msg)
    output_tensors[output_tensor_name] = fc.categorical_column_with_identity(
        key=input_tensor_name,
        num_buckets=bucket_size,
        default_value=default_value)
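# The parameters this transform expects might look like the following
# (illustrative values; the config schema beyond the keys read above is assumed):
parameters = {
    "input_tensor": "user_level",
    "output_tensor": "user_level_id",
    "bucket_size": 10,
    "default_value": 0,
}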
def create_feature_columns():
    # When the number of categories is unknown, discretize by hashing into a
    # fixed number of buckets
    Brand = fc.categorical_column_with_hash_bucket("Brand", 1000)
    # Discretization for a fixed, known vocabulary
    phoneOs = fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"],
                                                         default_value=0)
    # Continuous variable handling
    brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0, normalizer_fn=truncate)
    # One-hot encoding over integer ids
    matchType = fc.categorical_column_with_identity("matchType", 9, default_value=0)
    # fc.indicator_column can densify any of the categorical columns above
    return [Brand, phoneOs, brandPrefer, matchType]
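# A sketch of the densification mentioned in the last comment: sparse
# categorical columns must be wrapped in indicator_column (or embedding_column)
# before they can feed a dense input layer.
brand, phone_os, brand_prefer, match_type = create_feature_columns()
dense_columns = [fc.indicator_column(brand),
                 fc.indicator_column(phone_os),
                 brand_prefer,  # numeric column is already dense
                 fc.indicator_column(match_type)]
feature_layer = tf.keras.layers.DenseFeatures(dense_columns)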
def transform(inputs):
    transformed = inputs.copy()
    for feature_transform_info in FEATURE_TRANSFORM_INFO_EXECUTE_ARRAY:
        if feature_transform_info.op_name == TransformOp.HASH:
            transformed[feature_transform_info.output_name] = CategoryHash(
                feature_transform_info.param)(
                    transformed[feature_transform_info.input_name])
        elif feature_transform_info.op_name == TransformOp.BUCKETIZE:
            transformed[feature_transform_info.output_name] = NumericBucket(
                feature_transform_info.param)(
                    transformed[feature_transform_info.input_name])
        elif feature_transform_info.op_name == TransformOp.LOOKUP:
            transformed[feature_transform_info.output_name] = CategoryLookup(
                feature_transform_info.param)(
                    transformed[feature_transform_info.input_name])
        elif feature_transform_info.op_name == TransformOp.GROUP:
            group_inputs = [
                transformed[name]
                for name in feature_transform_info.input_name
            ]
            offsets = list(
                itertools.accumulate([0] + feature_transform_info.param[:-1]))
            transformed[feature_transform_info.output_name] = Group(offsets)(
                group_inputs)
        elif feature_transform_info.op_name == TransformOp.EMBEDDING:
            # The num_buckets should be calculated from the group items
            group_identity = fc.categorical_column_with_identity(
                feature_transform_info.input_name,
                num_buckets=feature_transform_info.param[0],
            )
            group_embedding = fc.embedding_column(
                group_identity, dimension=feature_transform_info.param[1])
            transformed[feature_transform_info.output_name] = \
                tf.keras.layers.DenseFeatures([group_embedding])({
                    feature_transform_info.input_name:
                        transformed[feature_transform_info.input_name]
                })
        elif feature_transform_info.op_name == TransformOp.ARRAY:
            transformed[feature_transform_info.output_name] = [
                transformed[name]
                for name in feature_transform_info.input_name
            ]
    return tuple([transformed[name] for name in TRANSFORM_OUTPUTS])
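# For orientation, one entry of FEATURE_TRANSFORM_INFO_EXECUTE_ARRAY might look
# like this. The field names follow the attribute accesses above; the record
# type and concrete values are hypothetical, not from the original source.
import collections

FeatureTransformInfo = collections.namedtuple(
    "FeatureTransformInfo", ["op_name", "input_name", "output_name", "param"])
embedding_info = FeatureTransformInfo(
    op_name=TransformOp.EMBEDDING,
    input_name="group1",
    output_name="group1_embedding",
    param=[1000, 16],  # [num_buckets, embedding_dimension]
)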
def create_linear_feature_columns():
    phoneBrand = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
    phoneResolution = fc.categorical_column_with_hash_bucket("phoneResolution", 500)
    phoneOs = fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"],
                                                         default_value=0)
    matchScore = fc.numeric_column("matchScore", default_value=0.0)
    popScore = fc.numeric_column("popScore", default_value=0.0)
    brandPrefer = fc.numeric_column("brandPrefer", default_value=0.0, normalizer_fn=truncate)
    cate2Prefer = fc.numeric_column("cate2Prefer", default_value=0.0, normalizer_fn=truncate)
    catePrefer = fc.numeric_column("catePrefer", default_value=0.0, normalizer_fn=truncate)
    sellerPrefer = fc.numeric_column("sellerPrefer", default_value=0.0, normalizer_fn=truncate)
    matchType = fc.categorical_column_with_identity("matchType", 9, default_value=0)
    position = fc.categorical_column_with_identity("position", 201, default_value=200)
    triggerNum = fc.categorical_column_with_identity("triggerNum", 51, default_value=50)
    triggerRank = fc.categorical_column_with_identity("triggerRank", 51, default_value=50)
    sceneType = fc.categorical_column_with_identity("type", 2, default_value=0)
    hour = fc.categorical_column_with_identity("hour", 24, default_value=0)
    columns = [phoneBrand, phoneResolution, phoneOs, matchScore, popScore,
               brandPrefer, cate2Prefer, catePrefer, sellerPrefer, matchType,
               position, triggerRank, triggerNum, sceneType, hour]
    print("linear feature columns:", columns)
    return columns
# numeric cols
for header in ['age']:
    feature_columns.append(feature_column.numeric_column(header))
age = feature_column.numeric_column("age")

# bucketized cols
age_buckets = feature_column.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
# feature_columns.append(age_buckets)

# for header in ['KreditFF', 'Insolvent', 'Reglerad', 'man']:
#     categorical_column = feature_column.categorical_column_with_identity(
#         key=header, num_buckets=100, default_value=0)
#     categorical_columns.append(feature_column.categorical_column_with_identity(
#         key=header, num_buckets=100, default_value=0))

categorical_column_1 = feature_column.categorical_column_with_identity(
    key='KreditFF360', num_buckets=100, default_value=0)
categorical_column_2 = feature_column.categorical_column_with_identity(
    key='A_Insolvent360', num_buckets=100, default_value=0)
categorical_column_3 = feature_column.categorical_column_with_identity(
    key='A_Utslag360', num_buckets=100, default_value=0)
categorical_column_4 = feature_column.categorical_column_with_identity(
    key='man', num_buckets=2, default_value=0)

# Note: this reassignment discards the numeric 'age' column appended above
feature_columns = [
    tf.feature_column.indicator_column(categorical_column_1),
    tf.feature_column.indicator_column(categorical_column_2),
    tf.feature_column.indicator_column(categorical_column_3)
    # tf.feature_column.indicator_column(categorical_column_4)
]
# feature_columns.append(age_buckets)
    # (tail of a preceding embedding_column example)
    sess.run(tf.tables_initializer())
    print(sess.run([color_embeding_dense_tensor]))

"""--------------------------------------------- bucketized_column -------------------------------------------------"""
"""bucketized_column"""
with tf.Session() as session:
    price = {'price': [[5.], [15.], [25.], [35.]]}
    price_column = feature_column.numeric_column('price')
    bucket_price = feature_column.bucketized_column(price_column, [10, 20, 30, 40])
    price_bucket_tensor = feature_column.input_layer(price, [bucket_price])
    print(session.run([price_bucket_tensor]))

"""--------------------------------------------- Categorical_column -------------------------------------------------"""
"""categorical_column_with_identity"""
with tf.Session() as sess:
    color_data = {'color': [[2], [5], [-1], [0]]}
    color_column = feature_column.categorical_column_with_identity('color', 7)
    color_column_identity = feature_column.indicator_column(color_column)
    color_dense_tensor = feature_column.input_layer(color_data, [color_column_identity])
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    print(sess.run([color_dense_tensor]))

"""categorical_column_with_vocabulary_list"""
with tf.Session() as sess:
    color_data = {'color': [['R', 'R'], ['G', 'R'], ['B', 'G'], ['A', 'A']]}
    color_column = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'], dtype=tf.string, default_value=-1)
    color_column_identity = feature_column.indicator_column(color_column)
    color_dense_tensor = feature_column.input_layer(color_data, [color_column_identity])
    sess.run(tf.global_variables_initializer())
def main(args):
    in_csv_path = Path(args.in_csv)
    y_name = args.y_var
    utils.setupLogFile(in_csv_path.parent)
    logging.info(' --- RUN for outvar {}, target {} ----- '.format(args.outvar, y_name))
    if not in_csv_path.exists():
        logging.error('Could not find the input file')

    try:
        dataframe = read_csv(str(in_csv_path))
        for column_name in dataframe.columns:
            if column_name.startswith('_'):
                dataframe.pop(column_name)
        for header in [
                'fl_1', 'bp_1', 'hc_1', 'ac_1', 'mom_age_edd', 'mom_weight_lb',
                'mom_height_in'
        ]:
            r = max(dataframe[header]) - min(dataframe[header])
            dataframe[header] = (dataframe[header] - min(dataframe[header])) / r
        train_whole = dataframe[(dataframe[y_name] != '.')
                                & (notna(dataframe[y_name]))
                                & (notnull(dataframe[y_name]))].copy()
        train_whole = train_whole.astype({y_name: 'int32'})
        logging.info('Number of training samples in the selected set: {}'.format(
            len(train_whole)))

        batch_size = 32  # A small batch size is used for demonstration purposes
        feature_columns = []
        feature_names = []
        for header in [
                'fl_1', 'bp_1', 'hc_1', 'ac_1', 'mom_age_edd', 'mom_weight_lb',
                'mom_height_in'
        ]:
            feature_columns.append(feature_column.numeric_column(header))
            feature_names.append(header)
        for header in [
                'hiv', 'current_smoker', 'former_smoker', 'chronic_htn',
                'preg_induced_htn', 'diabetes', 'gest_diabetes'
        ]:
            col = feature_column.categorical_column_with_identity(header, 2)
            col = feature_column.indicator_column(col)
            feature_columns.append(col)
            feature_names.append(header)

        all_pred = [0] * len(dataframe)
        print('****** args.all_train is: {}'.format(args.all_train))
        if not args.all_train:
            trimester = train_whole['trimester'].values.tolist()
            min_trim = min(trimester)
            max_trim = max(trimester)
            model_trim = None
            model_2trim = None
            model_3trim = None
            if min_trim == max_trim:
                # This data is only for one of the trimesters; train just that one
                logging.info('Training only for Trimester regression: {}'.format(
                    min_trim + 2))
                if min_trim == 0:
                    model_2trim = train_trimester_2(train_whole, feature_columns,
                                                    batch_size, y_name)
                    if model_2trim is None:
                        raise Exception('2nd trimester model empty')
                else:
                    model_3trim = train_trimester_3(train_whole, feature_columns,
                                                    batch_size, y_name)
                    if model_3trim is None:
                        raise Exception('3rd trimester model empty')
            else:
                model_trim = train_trimester(train_whole, feature_columns,
                                             batch_size, 'trimester')
                trim_2_df = train_whole[train_whole['trimester'] == 0]
                model_2trim = train_trimester_2(trim_2_df, feature_columns,
                                                batch_size, y_name)
                trim_3_df = train_whole[train_whole['trimester'] == 1]
                model_3trim = train_trimester_3(trim_3_df, feature_columns,
                                                batch_size, y_name)
                logging.info('-- done training for all three')

            # At least one regression model must exist at this point
            if model_2trim is None and model_3trim is None:
                raise Exception('One of the models came back empty during the '
                                'classification/regression phase')

            # Classify the dataset if this is a multi-trimester dataset
            if model_trim is not None and model_2trim is not None and model_3trim is not None:
                logging.info('Creating predictions for the full dataset')
                ds = df_to_dataset(dataframe, shuffle=False, batch_size=32,
                                   labels_name=y_name)
                ga_2trim = model_2trim.predict(ds)
                ga_3trim = model_3trim.predict(ds)
                ds = df_to_dataset(dataframe, shuffle=False, batch_size=32,
                                   labels_name='trimester')
                c_p = (model_trim.predict(ds) > 0).astype("int32")
                all_pred = [
                    g_2[0] if c == 0 else g_3[0]
                    for (g_2, g_3, c) in zip(ga_2trim, ga_3trim, c_p)
                ]
                logging.info('Length of all predictions list is: {}'.format(
                    len(all_pred)))
            elif min_trim == max_trim:
                ds = df_to_dataset(dataframe, shuffle=False, batch_size=32,
                                   labels_name=y_name)
                if min_trim == 0 and model_2trim is not None:
                    all_pred = model_2trim.predict(ds)
                elif min_trim == 1 and model_3trim is not None:
                    all_pred = model_3trim.predict(ds)
                else:
                    logging.error('Either 2nd or 3rd trimester data is null')
            else:
                logging.error('We are in unknown territory, exiting')
        else:
            # Per trimester if/else
            model_g = train_general(train_whole, feature_columns, batch_size, y_name)
            ds = df_to_dataset(dataframe, shuffle=False, batch_size=32,
                               labels_name=y_name)
            all_pred = model_g.predict(ds)

        logging.info('Creating output dataset')
        out_df = dataframe[['PatientID', 'filename', 'studydate']].copy()
        out_df[args.outvar] = all_pred
        out_path = in_csv_path.parent / (args.outvar + '.csv')
        logging.info('Should output to: {}'.format(out_path))
        out_df.to_csv(out_path)
    except Exception as e:
        logging.error('Error: \n{}'.format(e))
def categorical_indicator_with_vocabulary_file(feature_tensor, feature_info,
                                               file_io: FileIO):
    """
    Converts a string tensor into a categorical one-hot representation.
    Works by using a vocabulary file to convert the string tensor into
    categorical indices and then converting the categories into one-hot
    representation.

    Args:
        feature_tensor: String feature tensor
        feature_info: Dictionary representing the configuration parameters
            for the specific feature from the FeatureConfig

    Returns:
        Categorical one-hot representation of input feature_tensor

    Args under feature_layer_info:
        vocabulary_file: string; path to vocabulary CSV file for the input
            tensor containing the vocabulary to look up. Uses the "key" named
            column, or the 1st column if no "key" column is present.
        max_length: int; max number of rows to consider from the vocabulary
            file. If null, considers the entire file vocabulary.
        num_oov_buckets: int, optional; number of out-of-vocabulary
            buckets/slots to be used to encode strings into categorical
            indices. If not specified, the default is 1.

    NOTE: The vocabulary CSV file must contain two columns - key, id, where
    the key is mapped to one id, thereby resulting in a many-to-one vocabulary
    mapping. If the id field is absent, a unique whole-number id is assigned
    by default, resulting in a one-to-one mapping.
    """
    # ########################################################################
    # NOTE:
    # Current bug[1] with saving a Keras model when using
    # feature_column.categorical_column_with_vocabulary_list.
    # Tracking the issue currently and should be able to upgrade
    # to current latest stable release 2.2.0 to test.
    #
    # Can not use TF2.1.0 due to issue[2] regarding saving Keras models with
    # custom loss, metric layers
    #
    # Can not use TF2.2.0 due to issues[3, 4] regarding incompatibility of
    # Keras Functional API models and Tensorflow
    #
    # References:
    # [1] https://github.com/tensorflow/tensorflow/issues/31686
    # [2] https://github.com/tensorflow/tensorflow/issues/36954
    # [3] https://github.com/tensorflow/probability/issues/519
    # [4] https://github.com/tensorflow/tensorflow/issues/35138
    #
    # CATEGORICAL_VARIABLE = "categorical_variable"
    # categorical_fc = feature_column.categorical_column_with_vocabulary_list(
    #     CATEGORICAL_VARIABLE,
    #     vocabulary_list=vocabulary_list,
    #     default_value=feature_layer_info["args"].get("default_value", -1),
    #     num_oov_buckets=feature_layer_info["args"].get("num_oov_buckets", 0),
    # )
    #
    # indicator_fc = feature_column.indicator_column(categorical_fc)
    #
    # categorical_one_hot = layers.DenseFeatures(
    #     indicator_fc,
    #     name="{}_one_hot".format(feature_info.get("node_name", feature_info["name"])),
    # )({CATEGORICAL_VARIABLE: feature_tensor})
    # categorical_one_hot = tf.expand_dims(categorical_one_hot, axis=1)
    # ########################################################################
    feature_tensor_indices, vocabulary_keys, num_oov_buckets = \
        categorical_indices_from_vocabulary_file(feature_info, feature_tensor, file_io)

    vocabulary_size = len(set(vocabulary_keys))
    categorical_identity_fc = feature_column.categorical_column_with_identity(
        CATEGORICAL_VARIABLE, num_buckets=vocabulary_size + num_oov_buckets)
    indicator_fc = feature_column.indicator_column(categorical_identity_fc)

    categorical_one_hot = layers.DenseFeatures(
        indicator_fc,
        name="{}_one_hot".format(
            feature_info.get("node_name", feature_info["name"])),
    )({CATEGORICAL_VARIABLE: feature_tensor_indices})
    categorical_one_hot = tf.expand_dims(categorical_one_hot, axis=1)

    return categorical_one_hot
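# Based on the docstring's NOTE, a vocabulary CSV for this feature might look
# like the following (file contents are illustrative only):
#
#   key,id
#   red,0
#   crimson,0    <- many-to-one: maps to the same id as "red"
#   blue,1
#   green,2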
    boundaries=[0, 3, 5, 7, 9])
feature_columns.append(tenure_buckets)

# indicator cols
geography = feature_column.categorical_column_with_vocabulary_list(
    'Geography', ['France', 'Spain', 'Germany'])
geography_one_hot = feature_column.indicator_column(geography)
gender = feature_column.categorical_column_with_vocabulary_list(
    'Gender', ['Female', 'Male'])
gender_one_hot = feature_column.indicator_column(gender)
feature_columns.append(geography_one_hot)
feature_columns.append(gender_one_hot)
for header in ['HasCrCard', 'IsActiveMember']:
    col = feature_column.categorical_column_with_identity(key=header, num_buckets=2)
    col_one_hot = feature_column.indicator_column(col)
    feature_columns.append(col_one_hot)

# embedding cols
# hashed feature cols
# crossed cols

# Step 7 : Create a feature layer
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

# Step 8 : Create, compile, and train the model
def transform_from_code_gen(source_inputs):
    inputs = source_inputs.copy()

    education_hash_out = CategoryHash(education_hash.param)(inputs["education"])
    occupation_hash_out = CategoryHash(occupation_hash.param)(inputs["occupation"])
    native_country_hash_out = CategoryHash(native_country_hash.param)(
        inputs["native_country"])
    workclass_lookup_out = CategoryLookup(workclass_lookup.param)(inputs["workclass"])
    marital_status_lookup_out = CategoryLookup(marital_status_lookup.param)(
        inputs["marital_status"])
    relationship_lookup_out = CategoryLookup(relationship_lookup.param)(
        inputs["relationship"])
    race_lookup_out = CategoryLookup(race_lookup.param)(inputs["race"])
    sex_lookup_out = CategoryLookup(sex_lookup.param)(inputs["sex"])
    age_bucketize_out = NumericBucket(age_bucketize.param)(inputs["age"])
    capital_gain_bucketize_out = NumericBucket(capital_gain_bucketize.param)(
        inputs["capital_gain"])
    capital_loss_bucketize_out = NumericBucket(capital_loss_bucketize.param)(
        inputs["capital_loss"])
    hours_per_week_bucketize_out = NumericBucket(hours_per_week_bucketize.param)(
        inputs["hours_per_week"])

    group1_out = Group(group1.param)([
        workclass_lookup_out,
        hours_per_week_bucketize_out,
        capital_gain_bucketize_out,
        capital_loss_bucketize_out,
    ])
    group2_out = Group(group2.param)([
        education_hash_out,
        marital_status_lookup_out,
        relationship_lookup_out,
        occupation_hash_out,
    ])
    group3_out = Group(group3.param)([
        age_bucketize_out,
        sex_lookup_out,
        race_lookup_out,
        native_country_hash_out,
    ])

    group1_wide_embedding_column = fc.embedding_column(
        fc.categorical_column_with_identity(
            "group1", num_buckets=group1_embedding_wide.param[0]),
        dimension=group1_embedding_wide.param[1],
    )
    group1_embedding_wide_out = tf.keras.layers.DenseFeatures(
        [group1_wide_embedding_column])({"group1": group1_out})

    group2_wide_embedding_column = fc.embedding_column(
        fc.categorical_column_with_identity(
            "group2", num_buckets=group2_embedding_wide.param[0]),
        dimension=group2_embedding_wide.param[1],
    )
    group2_embedding_wide_out = tf.keras.layers.DenseFeatures(
        [group2_wide_embedding_column])({"group2": group2_out})

    group1_deep_embedding_column = fc.embedding_column(
        fc.categorical_column_with_identity(
            "group1", num_buckets=group1_embedding_deep.param[0]),
        dimension=group1_embedding_deep.param[1],
    )
    group1_embedding_deep_out = tf.keras.layers.DenseFeatures(
        [group1_deep_embedding_column])({"group1": group1_out})

    group2_deep_embedding_column = fc.embedding_column(
        fc.categorical_column_with_identity(
            "group2", num_buckets=group2_embedding_deep.param[0]),
        dimension=group2_embedding_deep.param[1],
    )
    group2_embedding_deep_out = tf.keras.layers.DenseFeatures(
        [group2_deep_embedding_column])({"group2": group2_out})

    group3_deep_embedding_column = fc.embedding_column(
        fc.categorical_column_with_identity(
            "group3", num_buckets=group3_embedding_deep.param[0]),
        dimension=group3_embedding_deep.param[1],
    )
    group3_embedding_deep_out = tf.keras.layers.DenseFeatures(
        [group3_deep_embedding_column])({"group3": group3_out})

    wide_embeddings_out = [
        group1_embedding_wide_out,
        group2_embedding_wide_out,
    ]
    deep_embeddings_out = [
        group1_embedding_deep_out,
        group2_embedding_deep_out,
        group3_embedding_deep_out,
    ]

    return wide_embeddings_out, deep_embeddings_out
def create_feature_columns():
    # user feature
    bids = fc.categorical_column_with_hash_bucket("behaviorBids", 10000, dtype=tf.int64)
    c1ids = fc.categorical_column_with_hash_bucket("behaviorC1ids", 100, dtype=tf.int64)
    cids = fc.categorical_column_with_hash_bucket("behaviorCids", 10000, dtype=tf.int64)
    sids = fc.categorical_column_with_hash_bucket("behaviorSids", 10000, dtype=tf.int64)
    pids = fc.categorical_column_with_hash_bucket("behaviorPids", 500000, dtype=tf.int64)
    bids_weighted = fc.weighted_categorical_column(bids, "bidWeights")
    c1ids_weighted = fc.weighted_categorical_column(c1ids, "c1idWeights")
    cids_weighted = fc.weighted_categorical_column(cids, "cidWeights")
    sids_weighted = fc.weighted_categorical_column(sids, "sidWeights")
    pids_weighted = fc.weighted_categorical_column(pids, "pidWeights")
    pid_embed = fc.embedding_column(pids_weighted, 64)
    bid_embed = fc.embedding_column(bids_weighted, 32)
    cid_embed = fc.embedding_column(cids_weighted, 48)
    c1id_embed = fc.embedding_column(c1ids_weighted, 10)
    sid_embed = fc.embedding_column(sids_weighted, 32)

    phoneBrandId = fc.categorical_column_with_hash_bucket("phoneBrand", 1000)
    phoneBrand = fc.embedding_column(phoneBrandId, 20)
    phoneResolutionId = fc.categorical_column_with_hash_bucket("phoneResolution", 500)
    phoneResolution = fc.embedding_column(phoneResolutionId, 10)
    phoneOs = fc.indicator_column(
        fc.categorical_column_with_vocabulary_list("phoneOs", ["android", "ios"],
                                                   default_value=0))
    gender = fc.indicator_column(
        fc.categorical_column_with_identity("gender", num_buckets=3, default_value=0))
    age_class = fc.indicator_column(
        fc.categorical_column_with_identity("age_class", num_buckets=7, default_value=0))
    has_baby = fc.indicator_column(
        fc.categorical_column_with_identity("has_baby", num_buckets=2, default_value=0))
    baby_gender = fc.indicator_column(
        fc.categorical_column_with_identity("baby_gender", num_buckets=3, default_value=0))
    baby_age = fc.indicator_column(
        fc.categorical_column_with_identity("baby_age", num_buckets=7, default_value=0))
    grade = fc.indicator_column(
        fc.categorical_column_with_identity("grade", num_buckets=7, default_value=0))
    rfm_type = fc.indicator_column(
        fc.categorical_column_with_identity("bi_rfm_type", num_buckets=12, default_value=0))
    city_id = fc.categorical_column_with_hash_bucket("city", 700)
    city = fc.embedding_column(city_id, 16)
    userType = fc.indicator_column(
        fc.categorical_column_with_identity("user_type", 6, default_value=0))
    hour = fc.indicator_column(
        fc.categorical_column_with_identity("hour", 24, default_value=0))

    global my_feature_columns
    my_feature_columns = [
        userType, hour, gender, age_class, has_baby, baby_gender, baby_age,
        grade, rfm_type, phoneBrand, phoneResolution, phoneOs, pid_embed,
        sid_embed, bid_embed, cid_embed, c1id_embed, city
    ]
    print("feature columns:", my_feature_columns)
    return my_feature_columns
def run_exp(params):
    exp.tag(params)
    URL = 'mushroom/all.csv'
    dataframe = pd.read_csv(URL)
    dataframe.head()
    specs, target = df_column_specs(dataframe, params=None)
    train, test = train_test_split(dataframe, test_size=0.2)
    train, val = train_test_split(train, test_size=0.2)
    print(len(train), 'train examples')
    print(len(val), 'validation examples')
    print(len(test), 'test examples')
    batch_size = params.batch or 32

    # A utility method to create a tf.data dataset from a Pandas Dataframe
    def df_to_dataset(dataframe, shuffle=True, batch_size=batch_size):
        dataframe = dataframe.copy()
        labels = dataframe.pop('target')
        ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
        if shuffle:
            ds = ds.shuffle(buffer_size=len(dataframe))
        ds = ds.batch(batch_size)
        return ds

    feature_columns = []
    # for mushroom we know the cols are all categorical, so we're not being too
    # careful here
    for col in specs:
        feature_columns.append(
            feature_column.indicator_column(
                feature_column.categorical_column_with_identity(
                    col['name'], col['card'])))
    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

    train_ds = df_to_dataset(train, batch_size=batch_size)
    val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
    test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

    # ## Create Model
    featurizer = tf.keras.Sequential([feature_layer], name='featurizer')
    l1 = params.l1 or 0.0
    inner_model_logits = tf.keras.Sequential([
        layers.Dense(1,
                     kernel_regularizer=tf.keras.regularizers.l1(l=l1),
                     kernel_initializer=tf.random_normal_initializer(),
                     bias_initializer=tf.random_normal_initializer())
    ], name='inner_model_logits')
    logit_prob = tf.keras.Sequential([layers.Activation('sigmoid')],
                                     name='logit_prob')
    features_prob = tf.keras.Sequential([inner_model_logits, logit_prob])

    # try out the model
    example_batch, label_batch = next(iter(train_ds))
    features = featurizer(example_batch)
    logits = inner_model_logits(features)
    logit_prob(logits)[:4]
    features_prob(features[:4])
    # predictions = model(example_batch)[:,0]

    loss_object = tf.keras.losses.BinaryCrossentropy()
    if params.adam:
        optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
    else:
        optimizer = tf.keras.optimizers.Ftrl(learning_rate=0.01)
    # features_prob.compile(optimizer=optimizer, loss=loss_object)

    def inner_model_logits_2_class(x):
        logits = inner_model_logits(x)
        return tf.concat([-logits, logits], axis=1)

    features_perturbed = projected_gradient_descent(inner_model_logits_2_class,
                                                    features, 0.4, 0.2, 5,
                                                    np.inf, y=label_batch,
                                                    targeted=False)

    input_dim = features_prob.layers[0].inputs[0].shape[1]
    inputs = tf.keras.Input(shape=(input_dim,))
    outputs = tf.keras.layers.Dense(
        1,
        activation='sigmoid',
        kernel_regularizer=tf.keras.regularizers.l1(l=l1),
        kernel_initializer=tf.random_normal_initializer(),
        bias_initializer=tf.random_normal_initializer())(inputs)
    features_prob_functional = tf.keras.Model(inputs=inputs, outputs=outputs)

    model = tf.keras.Sequential([
        featurizer,
        features_prob_functional
        # inner_model_logits,
        # logit_prob
    ])
    model(example_batch)[:4]

    def inner_model_probs_2_class(x):
        probs = features_prob_functional(x)
        return tf.concat([1 - probs, probs], axis=1)

    def mdl_to_2_class(mdl):
        def fn(x):
            probs = mdl(x)
            return tf.concat([1 - probs, probs], axis=1)
        return fn

    # test out adv perturbation, and closed form
    # features_perturbed_closed_form = adversarial.logistic_perturb(
    #     inner_model_logits, features, label_batch, 0.4)
    # [features_perturbed_closed_form[0, :5], features_perturbed[0, :5]]

    train_loss = tf.keras.metrics.Mean(name='train_loss')
    train_auc = tf.keras.metrics.AUC(name='train_auc')
    train_acc = tf.keras.metrics.BinaryAccuracy(name='train_acc')
    test_loss = tf.keras.metrics.Mean(name='test_loss')
    test_auc = tf.keras.metrics.AUC(name='test_auc')
    test_acc = tf.keras.metrics.BinaryAccuracy(name='test_acc')

    # @tf.function
    def train_step_perturb_input(examples, labels, eps, eps_step, num_pgd_steps):
        features = featurizer(examples)
        features_prob_functional(features)  # dummy call to force weights to be created
        # this is done outside the tape, but it's ok because we've ensured
        # there are no trainable vars in the featurizer (no embeddings!)
        features_perturbed = adversarial.logistic_perturb(
            features_prob_functional, features, labels, eps)
        with tf.GradientTape() as tape:
            predictions = features_prob_functional(features_perturbed)
            loss = loss_object(labels, predictions[:, 0])
            loss += sum(features_prob_functional.losses)
        vars = featurizer.trainable_variables + \
            features_prob_functional.trainable_variables
        gradients = tape.gradient(loss, vars)
        optimizer.apply_gradients(zip(gradients, vars))
        train_loss(loss)
        train_auc(labels, tf.reshape(predictions, [-1]))
        train_acc(labels, tf.reshape(predictions, [-1]))

    def train_step_perturb_last_layer(examples, labels, eps,
                                      freeze_initial_layers=True):
        with tf.GradientTape() as tape:
            features = featurizer(examples)
            left, right = model_metrics.sub_models(inner_model_logits)
            penultimate_activations = left(features)
            right(penultimate_activations)  # dummy call to force weights to be created
            d = penultimate_activations.shape[1]
            activations_perturbed = adversarial.logistic_perturb(
                right, penultimate_activations, labels, eps / math.sqrt(d))
            logits_perturbed = right(activations_perturbed)
            predictions = logit_prob(logits_perturbed)
            loss = loss_object(labels, predictions[:, 0])
            loss += sum(features_prob.losses)
        if freeze_initial_layers:
            mdl = inner_model_logits.layers[1]
        else:
            mdl = inner_model_logits
        vars = mdl.trainable_variables  # + featurizer.trainable_variables
        gradients = tape.gradient(loss, vars)
        optimizer.apply_gradients(zip(gradients, vars))
        train_loss(loss)
        train_auc(labels, tf.reshape(predictions, [-1]))

    # @tf.function
    def test_step(examples, labels):
        predictions = model(examples)[:, 0]
        t_loss = loss_object(labels, predictions)
        test_loss(t_loss)
        test_auc(labels, predictions)
        test_acc(labels, predictions)

    # example_batch, label_batch = next(iter(train_ds))
    # predictions = model(example_batch)[:,0]
    # loss_object(label_batch, predictions)

    def get_all_x_y(ds):
        batches = []
        y = []
        for examples, labels in ds:
            x = featurizer(examples)
            batches = batches + [x]
            y = y + [labels]
        return tf.concat(batches, axis=0), tf.concat(y, axis=0)

    def reset_states():
        train_loss.reset_states()
        train_auc.reset_states()
        train_acc.reset_states()
        test_loss.reset_states()
        test_auc.reset_states()
        test_acc.reset_states()

    EPOCHS = params.epochs or 20
    if args.quick:
        EPOCHS = 2
    eps = params.eps or 0.0  # 1.3 ?
    num_pgd_steps = params.pgd_steps or 5
    perturb_input = params.perturb_input
    clean_epochs = params.clean_epochs or 2
    if args.quick:
        clean_epochs = 1
    eps_step_factor = params.eps_step_factor or 5.0

    for epoch in range(EPOCHS):
        reset_states()
        if epoch < clean_epochs:
            eps_val = 0
        else:
            eps_val = eps
        eps_step = eps_val / eps_step_factor
        for images, labels in train_ds:
            if perturb_input:
                train_step_perturb_input(images, labels, eps_val, eps_step,
                                         num_pgd_steps)
            else:
                # perturb penultimate layer, using closed form for binary prob output
                train_step_perturb_last_layer(images, labels, eps_val,
                                              freeze_initial_layers=True)
        for test_images, test_labels in test_ds:
            test_step(test_images, test_labels)

        template = 'Eps={}, Epoch {}, Loss: {}, AUC: {}, ' \
                   'Test Loss: {}, Test AUC: {}, Test Acc: {}'
        if epoch % 5 == 0:
            print(template.format(eps_val, epoch + 1, train_loss.result(),
                                  train_auc.result() * 100, test_loss.result(),
                                  test_auc.result() * 100,
                                  test_acc.result() * 100))

    # dummy examples to force model shapes to be set
    example_batch, label_batch = next(iter(train_ds))
    model(example_batch)
    model_file = 'mushroom_models/' + f'eps={eps}'
    # model.save(model_file, overwrite=True)
    model.save_weights(model_file + ".ckpt", overwrite=True)
    m = model_metrics.weight_metrics(model)
    print(f'weight metrics with eps={eps}:')
    print(m)

    x_test, y_test = get_all_x_y(test_ds)
    attribution_method = 'shap' if params.attribution_shap else 'ig'
    mdl_shap = features_prob_functional
    mdl_ig = inner_model_probs_2_class
    av_ig_pct1pct, ig_ent, av_gini, ig_results = \
        model_metrics.attribs_pct(mdl_shap, mdl_ig, x_test, y_test,
                                  attribution_method=attribution_method)
    print(f'eps={eps}: IG_pct1pct={av_ig_pct1pct}')
    print(f'****done with eps={eps} ****')
    print(f'****************************')

    params_and_results = params.mod(
        dict(test_auc=np.round(test_auc.result() * 100, 2),
             test_acc=np.round(test_acc.result() * 100, 2),
             ig_ent=np.round(ig_ent, 2),
             gini=np.round(av_gini, 3),
             ig_1p=np.round(av_ig_pct1pct[0], 2)))
    print('*** logging ****')
    print(params_and_results.dict())
    exp.log(params_and_results.dict())
    exp.save()
    exp.close()
    return ig_results
def main():
    training_set = tf.contrib.learn.datasets.base.load_csv_without_header(
        filename='trainingData.txt', target_dtype=np.int,
        features_dtype=np.int, target_column=0)
    test_set = tf.contrib.learn.datasets.base.load_csv_without_header(
        filename='testData.txt', target_dtype=np.int,
        features_dtype=np.int, target_column=0)

    def input_fn_train():
        # method used to deliver the training data
        x = {
            "blinkyE": tf.convert_to_tensor(training_set.data),
            "inkyE": tf.convert_to_tensor(training_set.data),
            "pinkyE": tf.convert_to_tensor(training_set.data),
            "sueE": tf.convert_to_tensor(training_set.data),
            "blinkyDist": tf.convert_to_tensor(training_set.data),
            "inkyDist": tf.convert_to_tensor(training_set.data),
            "pinkyDist": tf.convert_to_tensor(training_set.data),
            "sueDist": tf.convert_to_tensor(training_set.data)
        }
        y = tf.convert_to_tensor(training_set.target)
        return x, y

    def input_fn_test():
        x = {
            "blinkyE": tf.convert_to_tensor(test_set.data),
            "inkyE": tf.convert_to_tensor(test_set.data),
            "pinkyE": tf.convert_to_tensor(test_set.data),
            "sueE": tf.convert_to_tensor(test_set.data),
            "blinkyDist": tf.convert_to_tensor(test_set.data),
            "inkyDist": tf.convert_to_tensor(test_set.data),
            "pinkyDist": tf.convert_to_tensor(test_set.data),
            "sueDist": tf.convert_to_tensor(test_set.data)
        }
        y = tf.convert_to_tensor(test_set.target)
        return x, y

    # Describe the feature columns
    a1 = fc.embedding_column(fc.categorical_column_with_identity(
        key="blinkyE", num_buckets=2, default_value=0), dimension=1)
    b1 = fc.embedding_column(fc.categorical_column_with_identity(
        key="inkyE", num_buckets=2, default_value=0), 1)
    c1 = fc.embedding_column(fc.categorical_column_with_identity(
        key="pinkyE", num_buckets=2, default_value=0), 1)
    d1 = fc.embedding_column(fc.categorical_column_with_identity(
        key="sueE", num_buckets=2, default_value=0), 1)
    e1 = fc.embedding_column(fc.categorical_column_with_identity(
        key="blinkyDist", num_buckets=5, default_value=0), 1)
    f1 = fc.embedding_column(fc.categorical_column_with_identity(
        key="inkyDist", num_buckets=5, default_value=0), 1)
    g1 = fc.embedding_column(fc.categorical_column_with_identity(
        key="pinkyDist", num_buckets=5, default_value=0), 1)
    h1 = fc.embedding_column(fc.categorical_column_with_identity(
        key="sueDist", num_buckets=5, default_value=0), 1)
    categorical_columns = set([a1, b1, c1, d1, e1, f1, g1, h1])

    estimator = tf.estimator.DNNClassifier(feature_columns=categorical_columns,
                                           hidden_units=[24, 50, 24],
                                           n_classes=6,
                                           model_dir="/tmp/a8_model")

    # Fit model
    print("Train: ")
    estimator.train(input_fn=input_fn_train, max_steps=20000)
    print("Fitted! : ")
    print("Evaluate using a test set: ")
    evaluate = estimator.evaluate(input_fn=input_fn_test, steps=1)
    print(evaluate)

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())

    # save the model
    feature_spec = tf.feature_column.make_parse_example_spec(categorical_columns)
    tfrecord_serving_input_fn = tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec)
    path = estimator.export_savedmodel(
        export_dir_base="/tmp/a8_model/saved",
        serving_input_receiver_fn=tfrecord_serving_input_fn,
        as_text=True)
    print(path)
def _get_category_column_from_dict(name, kwargs):
    """Builds the corresponding categorical feature column.

    :param name: column name
    :param kwargs: the following four arguments are mutually exclusive:
        * num_buckets: int, maximum value
        * hash_buckets: int, number of hash buckets
        * vocab_list: list, candidate vocabulary list
        * vocab_file: str, candidate vocabulary file
    :return: feature_column
    """
    convert_methods = {
        "num_buckets", "hash_buckets", "vocab_list", "vocab_file"
    }
    convert_method = set(kwargs.keys()).intersection(convert_methods)
    # The four arguments are mutually exclusive
    if len(convert_method) > 1:
        raise ValueError("{}: {} cannot coexist. "
                         "Please leave only one".format(name, convert_method))
    elif len(convert_method) < 1:
        raise ValueError("{}: argument missing, "
                         "one of {} should be given".format(name, convert_methods))
    else:
        convert_method = list(convert_method)[0]

    if convert_method == "num_buckets":
        # TODO: check dtype == int
        num_buckets = kwargs.pop(convert_method)
        assert num_buckets > 0
        # TODO: it would be better to use -1 if possible
        # missing values are filled with the maximum value
        default_value = num_buckets
        num_buckets += 1
        return fc.categorical_column_with_identity(name, num_buckets, default_value)
    elif convert_method == "hash_buckets":
        hash_bucket_size = kwargs.pop(convert_method)
        assert hash_bucket_size > 0
        # TODO: support passing in dtype
        return fc.categorical_column_with_hash_bucket(name, hash_bucket_size)
    elif convert_method == "vocab_list":
        vocabulary_list = kwargs.pop(convert_method)
        if isinstance(vocabulary_list, str):
            # TODO: support different separators for the vocabulary
            vocabulary_list = [x.strip() for x in vocabulary_list.split(",")]
        assert len(vocabulary_list) > 1, ("{}: list's length must be > 0, "
                                          "while got: {}".format(
                                              name, len(vocabulary_list)))
        # TODO: support dtype, default_value
        return fc.categorical_column_with_vocabulary_list(name, vocabulary_list)
    elif convert_method == "vocab_file":
        # TODO: support vocabulary files
        raise NotImplementedError("Support later.")
    else:
        raise ValueError("{}: {} is not supported".format(name, convert_method))
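# Usage sketch for the factory above (column names and settings are illustrative):
city_col = _get_category_column_from_dict("city_id", {"num_buckets": 1000})
brand_col = _get_category_column_from_dict("brand", {"hash_buckets": 5000})
os_col = _get_category_column_from_dict("os", {"vocab_list": "android, ios"})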