Example 1
def transform(inputs, NUMERIC_COLS, STRING_COLS, nbuckets):
    # Pass-through columns
    transformed = inputs.copy()
    del transformed['pickup_datetime']

    feature_columns = {
        colname: fc.numeric_column(colname)
        for colname in NUMERIC_COLS
    }

    # Scaling longitude from range [-78, -70] to [0, 1]
    for lon_col in ['pickup_longitude', 'dropoff_longitude']:
        transformed[lon_col] = layers.Lambda(lambda x: (x + 78) / 8.0,
                                             name='scale_{}'.format(lon_col))(
                                                 inputs[lon_col])

    # Scaling latitude from range [37, 45] to [0, 1]
    for lat_col in ['pickup_latitude', 'dropoff_latitude']:
        transformed[lat_col] = layers.Lambda(lambda x: (x - 37) / 8.0,
                                             name='scale_{}'.format(lat_col))(
                                                 inputs[lat_col])

    # Adding Euclidean dist (no need to be accurate: NN will calibrate it)
    transformed['euclidean'] = layers.Lambda(euclidean, name='euclidean')([
        inputs['pickup_longitude'], inputs['pickup_latitude'],
        inputs['dropoff_longitude'], inputs['dropoff_latitude']
    ])
    feature_columns['euclidean'] = fc.numeric_column('euclidean')

    # hour of day from timestamp of form '2010-02-08 09:17:00+00:00'
    transformed['hourofday'] = layers.Lambda(
        lambda x: tf.strings.to_number(tf.strings.substr(x, 11, 2),
                                       out_type=tf.dtypes.int32),
        name='hourofday')(inputs['pickup_datetime'])
    feature_columns['hourofday'] = fc.indicator_column(
        fc.categorical_column_with_identity('hourofday', num_buckets=24))

    latbuckets = np.linspace(0, 1, nbuckets).tolist()
    lonbuckets = np.linspace(0, 1, nbuckets).tolist()
    b_plat = fc.bucketized_column(feature_columns['pickup_latitude'],
                                  latbuckets)
    b_dlat = fc.bucketized_column(feature_columns['dropoff_latitude'],
                                  latbuckets)
    b_plon = fc.bucketized_column(feature_columns['pickup_longitude'],
                                  lonbuckets)
    b_dlon = fc.bucketized_column(feature_columns['dropoff_longitude'],
                                  lonbuckets)
    ploc = fc.crossed_column([b_plat, b_plon], nbuckets * nbuckets)
    dloc = fc.crossed_column([b_dlat, b_dlon], nbuckets * nbuckets)
    pd_pair = fc.crossed_column([ploc, dloc], nbuckets**4)
    feature_columns['pickup_and_dropoff'] = fc.embedding_column(pd_pair, 100)

    return transformed, feature_columns
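
A minimal sketch of how a transform() like this is wired into a Keras model. The
euclidean helper and the exact Input/column lists are assumptions reconstructed from
the calls above; only the pattern matters:

import tensorflow as tf
from tensorflow.keras import layers

def euclidean(params):
    # params arrives as the list handed to layers.Lambda above
    lon1, lat1, lon2, lat2 = params
    londiff = lon2 - lon1
    latdiff = lat2 - lat1
    return tf.sqrt(londiff * londiff + latdiff * latdiff)

NUMERIC_COLS = ['pickup_longitude', 'pickup_latitude',
                'dropoff_longitude', 'dropoff_latitude', 'passenger_count']
inputs = {c: layers.Input(name=c, shape=(), dtype='float32') for c in NUMERIC_COLS}
inputs['pickup_datetime'] = layers.Input(name='pickup_datetime',
                                         shape=(), dtype='string')

transformed, feature_columns = transform(inputs, NUMERIC_COLS, [], nbuckets=10)
dnn_inputs = layers.DenseFeatures(list(feature_columns.values()))(transformed)
h = layers.Dense(32, activation='relu')(dnn_inputs)
output = layers.Dense(1, name='fare')(h)
model = tf.keras.Model(inputs, output)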
Example 2
def build_features(statistics):
    pu_location_id = fc.categorical_column_with_identity(key='PULocationID',
                                                         num_buckets=265)
    do_location_id = fc.categorical_column_with_identity(key='DOLocationID',
                                                         num_buckets=265)
    day_of_week = fc.categorical_column_with_identity(key='day_of_week',
                                                      num_buckets=7)
    weekend = fc.categorical_column_with_identity(key='weekend', num_buckets=2)
    speed_buckets = fc.bucketized_column(
        fc.numeric_column('speed'), boundaries=[10, 20, 30, 40, 50, 60, 70])
    distance_buckets = fc.bucketized_column(
        fc.numeric_column('trip_distance'),
        boundaries=[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
    duration_buckets = fc.bucketized_column(
        fc.numeric_column('duration'),
        boundaries=[500, 1000, 1500, 2000, 2500, 3000, 3500, 4000, 4500])
    fare_buckets = fc.bucketized_column(
        fc.numeric_column('fare_amount'),
        boundaries=[2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30])
    passenger_buckets = fc.bucketized_column(
        fc.numeric_column('passenger_count'), boundaries=[1, 3, 5, 7, 9])
    location = fc.crossed_column([pu_location_id, do_location_id],
                                 hash_bucket_size=1000)
    cross_all = fc.crossed_column([
        location, speed_buckets, distance_buckets, duration_buckets,
        fare_buckets, passenger_buckets
    ],
                                  hash_bucket_size=1000)
    categorical_columns = [
        fc.embedding_column(pu_location_id, dimension=32),
        fc.embedding_column(do_location_id, dimension=32),
        fc.indicator_column(day_of_week),
        fc.indicator_column(weekend)
    ]
    numeric_columns = [
        custom_numeric_column('passenger_count', statistics),
        custom_numeric_column('trip_distance', statistics),
        custom_numeric_column('fare_amount', statistics),
        custom_numeric_column('extra', statistics),
        custom_numeric_column('mta_tax', statistics),
        custom_numeric_column('tolls_amount', statistics),
        custom_numeric_column('improvement_surcharge', statistics),
        custom_numeric_column('duration', statistics),
        custom_numeric_column('speed', statistics)
    ]
    dnn_feature_columns = numeric_columns + categorical_columns
    linear_feature_columns = [location, cross_all]
    return dnn_feature_columns, linear_feature_columns
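
The two column lists returned here line up with the canned wide-and-deep estimator;
a sketch (model_dir and the hidden-unit sizes are placeholders):

dnn_feature_columns, linear_feature_columns = build_features(statistics)
estimator = tf.estimator.DNNLinearCombinedRegressor(
    model_dir='/tmp/taxi_model',
    linear_feature_columns=linear_feature_columns,
    dnn_feature_columns=dnn_feature_columns,
    dnn_hidden_units=[128, 64])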
Example 3

def test_crossed_column():
    """ crossed_column test """
    # Source data
    features = {
        'price': [['A'], ['B'], ['C']],  # 0,1,2
        'color': [['R'], ['G'], ['B']]  # 0,1,2
    }
    # categorical_column
    price = feature_column.categorical_column_with_vocabulary_list(
        'price', ['A', 'B', 'C', 'D'])
    color = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'])

    # crossed_column produces a sparse representation
    p_x_c = feature_column.crossed_column([price, color], 16)

    # indicator_column densifies it into a multi-hot vector
    p_x_c_identy = feature_column.indicator_column(p_x_c)

    # hook the crossed column up to the source data
    p_x_c_identy_dense_tensor = feature_column.input_layer(
        features, [p_x_c_identy])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([p_x_c_identy_dense_tensor]))
Example 4

    def _make_crossed(self):
        """Makes crossed features for both the Wide and Deep networks.

        Returns:
          Tuple (crossed columns for Wide, crossed embedding columns for Deep)
        """
        # Crossed columns
        f_crossed_for_wide = []
        f_crossed_for_deep = []
        for to_cross in self.CROSSED:
            keys = []
            bck_size = 1
            for (key, bck, bnd) in to_cross:
                keys.append(self._prepare_for_crossing(key, bck, bnd))
                bck_size *= bck

            # We can't go crazy on the dim for crossed_column so use a min
            # **0.25 is a rule of thumb for bucket size vs dimension
            t_crossed = tfc.crossed_column(keys, min(bck_size, 10000))
            t_dimension = int(bck_size**0.25)
            f_crossed_for_wide.append(t_crossed)
            f_crossed_for_deep.append(
                tfc.embedding_column(t_crossed, t_dimension))

        return f_crossed_for_wide, f_crossed_for_deep
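
Worked through, the rule of thumb above: crossing two 1000-bucket keys gives
bck_size = 1,000,000, which the min() caps at 10,000 hash buckets, while the
fourth-root rule picks the embedding dimension:

>>> bck_size = 1000 * 1000
>>> min(bck_size, 10000)
10000
>>> int(bck_size ** 0.25)
31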
Example 5

def _build_census_wide_columns(numeric_range=None):
    base_columns, cross_columns = [], []
    for col in ALI_DISPLAY_ADS_CONFIG['wide_muti_hot_cols']:
        base_columns.append(
            fc.indicator_column(
                fc.categorical_column_with_hash_bucket(
                    col,
                    hash_bucket_size=1000 if
                    ALI_DISPLAY_ADS_CONFIG['vocab_size'][col] <= 1000 else
                    ALI_DISPLAY_ADS_CONFIG['vocab_size'][col] + 10000)))
    for col in ALI_DISPLAY_ADS_CONFIG['wide_bucket_cols']:
        base_columns.append(
            fc.bucketized_column(fc.numeric_column(col),
                                 boundaries=list(
                                     np.linspace(numeric_range[col][0],
                                                 numeric_range[col][1],
                                                 1000))))
    for col in ALI_DISPLAY_ADS_CONFIG['wide_cross_cols']:
        cross_columns.append(
            fc.indicator_column(
                fc.crossed_column([col[0], col[1]],
                                  hash_bucket_size=10000)))
    feature_columns = base_columns + cross_columns
    feat_field_size = len(feature_columns)
    return feature_columns, feat_field_size
Example 6
    def data_preprocessing(self):
        """
        batch_size = 5  # 예제를 위해 작은 배치 크기를 사용합니다.
        train_ds = self.df_to_dataset(self.train, batch_size=batch_size)
        val_ds = self.df_to_dataset(self.val, shuffle=False, batch_size=batch_size)
        test_ds = self.df_to_dataset(self.test, shuffle=False, batch_size=batch_size)

        for feature_batch, label_batch in train_ds.take(1):
            print('전체 특성:', list(feature_batch.keys()))
            print('나이 특성의 배치:', feature_batch['age'])
            print('타깃의 배치:', label_batch)

        # 특성 열을 시험해 보기 위해 샘플 배치를 만듭니다.
        self.example_batch = next(iter(train_ds))[0]

        age = feature_column.numeric_column("age")
        self.demo(age)
        """
        feature_columns = []

        # Numeric columns
        for header in [
                'age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca'
        ]:
            feature_columns.append(feature_column.numeric_column(header))

        # Bucketized columns
        age = feature_column.numeric_column("age")
        age_buckets = feature_column.bucketized_column(
            age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
        feature_columns.append(age_buckets)

        # Categorical columns
        thal = feature_column.categorical_column_with_vocabulary_list(
            'thal', ['fixed', 'normal', 'reversible'])
        thal_one_hot = feature_column.indicator_column(thal)
        feature_columns.append(thal_one_hot)

        # Embedding columns
        thal_embedding = feature_column.embedding_column(thal, dimension=8)
        feature_columns.append(thal_embedding)

        # Crossed feature columns
        crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                        hash_bucket_size=1000)
        crossed_feature = feature_column.indicator_column(crossed_feature)
        feature_columns.append(crossed_feature)

        self.feature_layer = layers.DenseFeatures(feature_columns)

        batch_size = 32
        self.train_ds = self.df_to_dataset(self.train, batch_size=batch_size)
        self.val_ds = self.df_to_dataset(self.val,
                                         shuffle=False,
                                         batch_size=batch_size)
        self.test_ds = self.df_to_dataset(self.test,
                                          shuffle=False,
                                          batch_size=batch_size)
Example 7
def create_feature_columns(dataset, embed_size=32, hash_size=10000):
    n_users = dataset.user.nunique()
    n_items = dataset.item.nunique()
    genre_list = dataset.genre1.unique()
    users = fc.categorical_column_with_vocabulary_list("user",
                                                       np.arange(n_users),
                                                       default_value=-1,
                                                       dtype=tf.int64)
    items = fc.categorical_column_with_vocabulary_list("item",
                                                       np.arange(n_items),
                                                       default_value=-1,
                                                       dtype=tf.int64)
    gender = fc.categorical_column_with_vocabulary_list("gender", ["M", "F"])
    age = fc.categorical_column_with_vocabulary_list(
        "age", [1, 18, 25, 35, 45, 50, 56], dtype=tf.int64)
    occupation = fc.categorical_column_with_vocabulary_list("occupation",
                                                            np.arange(21),
                                                            dtype=tf.int64)
    genre1 = fc.categorical_column_with_vocabulary_list("genre1", genre_list)
    genre2 = fc.categorical_column_with_vocabulary_list("genre2", genre_list)
    genre3 = fc.categorical_column_with_vocabulary_list("genre3", genre_list)

    wide_cols = [
        users, items, gender, age, occupation, genre1, genre2, genre3,
        fc.crossed_column([gender, age, occupation],
                          hash_bucket_size=hash_size),
        fc.crossed_column([age, genre1], hash_bucket_size=hash_size)
    ]

    embed_cols = [users, items, age, occupation]
    deep_cols = list()
    for col in embed_cols:
        deep_cols.append(fc.embedding_column(col, embed_size))

    shared_embed_cols = [genre1, genre2, genre3]
    deep_cols.extend(fc.shared_embedding_columns(shared_embed_cols,
                                                 embed_size))
    deep_cols.append(fc.indicator_column(gender))

    label = fc.numeric_column("label", default_value=0.0, dtype=tf.float32)
    feat_columns = [label]
    feat_columns += wide_cols
    feat_columns += deep_cols
    feat_spec = fc.make_parse_example_spec(feat_columns)
    return wide_cols, deep_cols, feat_spec
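
The feat_spec returned by make_parse_example_spec is what tf.io.parse_example
expects when decoding serialized tf.train.Example records; a sketch (the TFRecord
filename is a placeholder):

def parse_fn(serialized):
    return tf.io.parse_example(serialized, feat_spec)

dataset = (tf.data.TFRecordDataset('train.tfrecord')
           .batch(256)
           .map(parse_fn))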
Example 8
def create_feature_layer(df):
    week = feature_column.numeric_column("Week")
    week = feature_column.bucketized_column(week, boundaries=list(range(1, 53)))
    day = feature_column.numeric_column("Day")
    day = feature_column.bucketized_column(day, boundaries=list(range(1, 8)))
    year = feature_column.numeric_column("Year")
    year = feature_column.bucketized_column(year, boundaries=list(range(2013, 2017)))
    hour = feature_column.numeric_column("std_hour")
    hour = feature_column.bucketized_column(hour, boundaries=list(range(0, 24)))
    arrival = feature_column.categorical_column_with_vocabulary_list(
        "Arrival", vocabulary_list=pd.Series.unique(df.Arrival).tolist())
    airline = feature_column.categorical_column_with_vocabulary_list(
        "Airline", vocabulary_list=pd.Series.unique(df.Airline).tolist())
    flight_no = feature_column.categorical_column_with_vocabulary_list(
        "flight_no", vocabulary_list=pd.Series.unique(df.flight_no).tolist())
    arrival_one_hot = feature_column.indicator_column(arrival)
    airline_one_hot = feature_column.indicator_column(airline)
    flight_no_one_hot = feature_column.indicator_column(flight_no)
    arrival_length = len(pd.Series.unique(df.Arrival).tolist())
    arrival_and_week = feature_column.crossed_column(
        [arrival, week], hash_bucket_size=(arrival_length * 52))
    arrival_and_week = feature_column.indicator_column(arrival_and_week)
    airline_length = len(pd.Series.unique(df.Airline).tolist())
    year_and_airline = feature_column.crossed_column(
        [year, airline], hash_bucket_size=(airline_length * 4))
    year_and_airline = feature_column.indicator_column(year_and_airline)
    feature_columns = [
        week, arrival_one_hot, airline_one_hot, flight_no_one_hot, hour,
        arrival_and_week, year, year_and_airline
    ]
    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
    return feature_layer
Example 9
 def crossed_feature_columns(self,
                             columns_crossed,
                             nameOfLayer,
                             bucket_size=10):
     crossed_feature = feature_column.crossed_column(
         columns_crossed, hash_bucket_size=bucket_size)
     crossed_feature = feature_column.indicator_column(crossed_feature)
     self.sparse_columns[nameOfLayer] = crossed_feature
     return crossed_feature
Example 10
def get_feature_columns(dataframe):
    """Creates feature columns from pd.DataFrame."""
    feature_columns = []
    feature_layer_inputs = {}

    # numeric cols
    for col_name in ['PhotoAmt', 'Fee', 'Age']:
        feature_columns.append(feature_column.numeric_column(col_name))
        feature_layer_inputs[col_name] = tf.keras.Input(shape=(1, ),
                                                        name=col_name)

    # bucketized cols
    age = feature_column.numeric_column('Age')
    age_buckets = feature_column.bucketized_column(age,
                                                   boundaries=[1, 2, 3, 4, 5])
    feature_columns.append(age_buckets)

    # indicator_columns
    indicator_column_names = [
        'Type', 'Color1', 'Color2', 'Gender', 'MaturitySize', 'FurLength',
        'Vaccinated', 'Sterilized', 'Health'
    ]
    for col_name in indicator_column_names:
        categorical_column = feature_column.categorical_column_with_vocabulary_list(
            col_name, dataframe[col_name].unique())
        indicator_column = feature_column.indicator_column(categorical_column)
        feature_columns.append(indicator_column)
        feature_layer_inputs[col_name] = tf.keras.Input(shape=(1, ),
                                                        name=col_name,
                                                        dtype=tf.string)

    # embedding columns
    breed1 = feature_column.categorical_column_with_vocabulary_list(
        'Breed1', dataframe.Breed1.unique())
    breed1_embedding = feature_column.embedding_column(breed1, dimension=16)
    feature_columns.append(breed1_embedding)
    feature_layer_inputs['Breed1'] = tf.keras.Input(shape=(1, ),
                                                    name='Breed1',
                                                    dtype=tf.string)

    # crossed columns
    animal_type = feature_column.categorical_column_with_vocabulary_list(
        'Type', ['Cat', 'Dog'])
    feature_columns.append(feature_column.indicator_column(animal_type))
    age_type_feature = feature_column.crossed_column(
        [age_buckets, animal_type], hash_bucket_size=100)
    feature_columns.append(feature_column.indicator_column(age_type_feature))
    feature_layer_inputs['Type'] = tf.keras.Input(shape=(1, ),
                                                  name='Type',
                                                  dtype=tf.string)

    return feature_columns, feature_layer_inputs
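
A sketch of consuming these two return values in a Keras functional model (the
layer sizes are placeholders):

feature_columns, feature_layer_inputs = get_feature_columns(dataframe)
x = tf.keras.layers.DenseFeatures(feature_columns)(feature_layer_inputs)
x = tf.keras.layers.Dense(64, activation='relu')(x)
output = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model = tf.keras.Model(inputs=list(feature_layer_inputs.values()), outputs=output)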
Example 11
def classify_data(batch_size=5):
    from tensorflow import feature_column
    from tensorflow.keras import layers
    from sklearn.model_selection import train_test_split
    URL = 'https://storage.googleapis.com/applied-dl/heart.csv'
    dataframe = pd.read_csv(URL)
    tr, te = train_test_split(dataframe, test_size=0.2)
    tr, va = train_test_split(tr, test_size=0.2)
    print(len(tr), len(va), len(te))

    def df_to_dataset(dataframe, shuffle=True, batch_size=32):
        dataframe, labels = dataframe.copy(), dataframe.pop('target')
        ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
        if shuffle:
            ds = ds.shuffle(buffer_size=len(dataframe))
        # batch unconditionally (the original only batched when shuffling)
        return ds.batch(batch_size)

    tr_ds = df_to_dataset(tr, batch_size=batch_size)
    va_ds = df_to_dataset(va, shuffle=False, batch_size=batch_size)
    te_ds = df_to_dataset(te, shuffle=False, batch_size=batch_size)
    feature_columns = []
    for header in [
            'age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca'
    ]:
        feature_columns.append(feature_column.numeric_column(header))
    age = feature_column.numeric_column('age')
    age_buckets = feature_column.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    feature_columns.append(age_buckets)
    thal = feature_column.categorical_column_with_vocabulary_list(
        'thal', ['fixed', 'normal', 'reversible'])
    feature_columns.append(feature_column.indicator_column(thal))
    feature_columns.append(feature_column.embedding_column(thal, dimension=8))
    crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                    hash_bucket_size=1000)
    feature_columns.append(feature_column.indicator_column(crossed_feature))
    feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
    model = tf.keras.Sequential([
        feature_layer,
        layers.Dense(128, activation='relu'),
        layers.Dense(128, activation='relu'),
        layers.Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.fit(tr_ds, validation_data=va_ds, epochs=5)
    loss, accuracy = model.evaluate(te_ds)
    print(accuracy)
Example 12
def cross_all_columns():
    all_cols = categorical_names + integer_names  # 'all' shadowed the builtin
    fcs = []
    fck = {}
    # probably way cleaner way to do this
    # set key indicating pairs have been crossed
    # if not crossed, append to fcs, and set true
    for n1 in all_cols:
        for n2 in all_cols:
            if n1 == n2:
                continue  # crossed_column raises on duplicate keys
            k1 = "%s%s" % (n1, n2)
            k2 = "%s%s" % (n2, n1)
            if fck.get(k1) is None and fck.get(k2) is None:
                # hash_bucket_size must be an int (1e6 is a float)
                fcs.append(crossed_column([n1, n2], int(1e6)))
                fck[k1] = True
                fck[k2] = True

    return fcs
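
The pair bookkeeping above can be written more directly with itertools.combinations,
which enumerates each unordered pair exactly once (a sketch; it assumes the same
categorical_names / integer_names globals):

from itertools import combinations
from tensorflow.feature_column import crossed_column

def cross_all_columns_v2():
    all_cols = categorical_names + integer_names
    return [crossed_column(list(pair), int(1e6))
            for pair in combinations(all_cols, 2)]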
Example 13
def build_feature_columns():
    age = feature_column.numeric_column('age')
    age_bucket = feature_column.bucketized_column(age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    workclass = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('workclass',
                ['Private', 'Self-emp-not-inc', 'Self-emp-inc', 'Federal-gov', 
                'Local-gov', 'State-gov', 'Without-pay', 'Never-worked']))
    fnlwgt = feature_column.numeric_column('fnlwgt')
    education = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('education',
                ['Bachelors', 'Some-college', '11th', 'HS-grad', 'Prof-school',
                 'Assoc-acdm', 'Assoc-voc', '9th', '7th-8th', '12th', 'Masters',
                 '1st-4th', '10th', 'Doctorate', '5th-6th', 'Preschool']))
    education_num = feature_column.numeric_column('education_num')
    marital_status = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('marital_status',
                ['Married-civ-spouse', 'Divorced', 'Never-married', 'Separated', 'Widowed',
                 'Married-spouse-absent', 'Married-AF-spouse']))
    occupation = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('occupation',
                ['Tech-support', 'Craft-repair', 'Other-service', 'Sales', 'Exec-managerial',
                 'Prof-specialty', 'Handlers-cleaners', 'Machine-op-inspct', 'Adm-clerical', 
                 'Farming-fishing', 'Transport-moving', 'Priv-house-serv', 'Protective-serv', 
                 'Armed-Forces']))
    relationship = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('relationship',
                ['Wife', 'Own-child', 'Husband', 'Not-in-family', 'Other-relative', 'Unmarried']))
    race = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('race', 
                ['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black']))
    gender = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('gender', 
                ['Female', 'Male']))    
    capital_gain = feature_column.numeric_column('capital_gain') 
    capital_loss = feature_column.numeric_column('capital_loss')
    hours_per_week = feature_column.numeric_column('hours_per_week')
    native_country = feature_column.indicator_column(feature_column.categorical_column_with_vocabulary_list('native_country',
                ['United-States', 'Cambodia', 'England', 'Puerto-Rico', 'Canada', 'Germany',
                 'Outlying-US(Guam-USVI-etc)', 'India', 'Japan', 'Greece', 'South', 'China', 
                 'Cuba', 'Iran', 'Honduras', 'Philippines', 'Italy', 'Poland', 'Jamaica', 'Vietnam',
                 'Mexico', 'Portugal', 'Ireland', 'France', 'Dominican-Republic', 'Laos', 'Ecuador',
                 'Taiwan', 'Haiti', 'Columbia', 'Hungary', 'Guatemala', 'Nicaragua', 'Scotland',
                 'Thailand', 'Yugoslavia', 'El-Salvador', 'Trinadad&Tobago', 'Peru', 'Hong', 
                 'Holand-Netherlands']))
    race_gender = feature_column.indicator_column(feature_column.crossed_column([
        feature_column.categorical_column_with_vocabulary_list('race', ['White', 'Asian-Pac-Islander', 'Amer-Indian-Eskimo', 'Other', 'Black']),
        feature_column.categorical_column_with_vocabulary_list('gender', ['Female', 'Male']) ], hash_bucket_size=10))

    wide = [age_bucket, workclass, fnlwgt, education, education_num, occupation, relationship, race, gender, capital_gain, capital_loss, hours_per_week, native_country, race_gender]    
    deep = [age, workclass, fnlwgt, education, education_num, occupation, relationship, race, gender, capital_gain, capital_loss, hours_per_week, native_country]
    return (wide, deep)
Example 14
    def transform(self, output_tensors):
        input_tensor_name = self.parameters.get("input_tensor")
        output_tensor_name = self.parameters.get("output_tensor")
        if "hash_bucket_size" in self.parameters:  # dict.has_key() is Python 2 only
            hash_bucket_size = self.parameters.get("hash_bucket_size")
        else:
            msg = "parameters error, crossed_column must have hash_bucket_size"
            logger.error(msg)
            raise ParametersError(msg)
        column_names = input_tensor_name.split(",")
        columns = [output_tensors.get(name) for name in column_names]
        # combiner = self.parameters.get("combiner", "mean")
        output_tensor = fc.crossed_column(keys=columns,
                                          hash_bucket_size=hash_bucket_size)

        output_tensors[output_tensor_name] = output_tensor
Example 15
def test_crossed_column():
    # Each row carries two price values and two color values, so the cross
    # yields 2 x 2 = 4 hashed ids per row.
    features = {
        'price': [['A', 'A'], ['B', 'D'], ['C', 'A']],
        'color': [['R', 'R'], ['G', 'G'], ['B', 'B']]
    }

    price = feature_column.categorical_column_with_vocabulary_list(
        'price', ['A', 'B', 'C', 'D'])
    color = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'])
    p_x_c = feature_column.crossed_column([price, color], 16)
    # indicator_column densifies the sparse cross into a multi-hot row of length 16
    p_x_c_identy = feature_column.indicator_column(p_x_c)
    p_x_c_identy_dense_tensor = feature_column.input_layer(
        features, [p_x_c_identy])

    with tf.Session() as session:
        session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print('use input_layer' + '_' * 40)
        print(session.run([p_x_c_identy_dense_tensor]))
Example 16

def build_model():
    feature_columns = []

    for header in [
            'age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'slope', 'ca'
    ]:
        feature_columns.append(feature_column.numeric_column(header))

    age = feature_column.numeric_column("age")
    age_buckets = feature_column.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    feature_columns.append(age_buckets)

    thal = feature_column.categorical_column_with_vocabulary_list(
        'thal', ['fixed', 'normal', 'reversible'])
    thal_one_hot = feature_column.indicator_column(thal)
    feature_columns.append(thal_one_hot)

    thal_embedding = feature_column.embedding_column(thal, dimension=8)
    feature_columns.append(thal_embedding)

    crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                    hash_bucket_size=1000)
    crossed_feature = feature_column.indicator_column(crossed_feature)
    feature_columns.append(crossed_feature)

    feature_layer = keras.layers.DenseFeatures(feature_columns)

    model = tf.keras.Sequential([
        feature_layer,
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'],
                  run_eagerly=True)

    return model
Example 17
    def build(self, columns_dict):
        """ Build the crossed_column.

        :param columns_dict: dict of already-built columns {name: column}
        :return:
        """

        if not self.keys:
            columns = [columns_dict[k] for k in self.key_names]
            invalid_cols = [
                f for f in columns if not isinstance(f, CategoricalColumn)
            ]
            if invalid_cols:
                raise ValueError(
                    "{} are not CategoricalColumn".format(invalid_cols))
            self.keys = [c.get_input_column() for c in columns]

        if not self._feature_column:
            self._feature_column = fc.crossed_column(self.keys,
                                                     self.hash_bucket_size)
        return self
Example 18
def test_crossed_column():
    # 1. Input features
    features = {
        'price': [['A'], ['B'], ['C'], ['C']],
        'color': [['R'], ['G'], ['B'], ['B']]
    }
    # 2. Feature columns (Sparse)
    price = feature_column.categorical_column_with_vocabulary_list(
        'price', ['A', 'B', 'C', 'D'])
    # 2. Feature columns (Sparse)
    color = feature_column.categorical_column_with_vocabulary_list(
        'color', ['R', 'G', 'B'])
    # 2. Feature columns (Sparse)
    p_x_c = feature_column.crossed_column([price, color], 16)
    # 2. Feature columns (Dense)
    p_x_c_identity = feature_column.indicator_column(p_x_c)
    # 3. Feature tensor
    p_x_c_identity_dense_tensor = feature_column.input_layer(
        features, [p_x_c_identity])
    with tf.Session() as session:
        #session.run(tf.global_variables_initializer())
        session.run(tf.tables_initializer())
        print(session.run([p_x_c_identity_dense_tensor]))
Example 19
age_buckets = feature_column.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
feature_columns.append(age_buckets)

# indicator cols
thal = feature_column.categorical_column_with_vocabulary_list(
    'thal', ['fixed', 'normal', 'reversible'])
thal_one_hot = feature_column.indicator_column(thal)
feature_columns.append(thal_one_hot)

# embedding cols
thal_embedding = feature_column.embedding_column(thal, dimension=8)
feature_columns.append(thal_embedding)

# crossed cols
crossed_feature = feature_column.crossed_column([age_buckets, thal],
                                                hash_bucket_size=1000)
crossed_feature = feature_column.indicator_column(crossed_feature)
feature_columns.append(crossed_feature)

feature_layer = tf.keras.layers.DenseFeatures(feature_columns)
batch_size = 32
train_ds = df_to_dataset(train, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)

model = tf.keras.Sequential([
    feature_layer,
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dense(1, activation=tf.nn.sigmoid)
])
Example 20

education = fc.categorical_column_with_vocabulary_list('education', [
    'Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
    'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
    '5th-6th', '10th', '1st-4th', 'Preschool', '12th'
])
marital_status = fc.categorical_column_with_vocabulary_list(
    'marital_status', [
        'Married-civ-spouse', 'Divorced', 'Married-spouse-absent',
        'Never-married', 'Separated', 'Married-AF-spouse', 'Widowed'
    ])
workclass = fc.categorical_column_with_vocabulary_list('workclass', [
    'Self-emp-not-inc', 'Private', 'State-gov', 'Federal-gov', 'Local-gov',
    '?', 'Self-emp-inc', 'Without-pay', 'Never-worked'
])
categorical_columns = [
    relationship, occupation, education, marital_status, workclass
]

# crossed
education_x_occupation = fc.crossed_column(['education', 'occupation'],
                                           hash_bucket_size=1000)
crossed_columns = [education_x_occupation]

# train
classifier = tf.estimator.LinearClassifier(
    feature_columns=numeric_columns + categorical_columns + crossed_columns,
    optimizer=tf.train.FtrlOptimizer(learning_rate=0.1,
                                     l1_regularization_strength=0.1,
                                     l2_regularization_strength=0.1))
classifier.train(get_train_dataset)
result = classifier.evaluate(get_test_dataset)
pprint(result)
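
get_train_dataset / get_test_dataset are estimator input functions that are not
shown in this snippet; a minimal sketch of the usual shape (the CSV path and the
label handling are assumptions):

def get_train_dataset():
    ds = tf.data.experimental.make_csv_dataset(
        'adult.data.csv', batch_size=128, label_name='income_bracket')
    return ds.map(lambda features, label: (features, tf.equal(label, '>50K')))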
Example 21
def input_template_feed_keras(Xtrain, cols_type_received, cols_ref, **kw):
    """
       Create sparse data struccture in KERAS  To plug with MODEL:
       No data, just virtual data
    https://github.com/GoogleCloudPlatform/data-science-on-gcp/blob/master/09_cloudml/flights_model_tf2.ipynb

    :return:
    """
    from tensorflow.feature_column import (categorical_column_with_hash_bucket,
                                           numeric_column, embedding_column,
                                           bucketized_column, crossed_column,
                                           indicator_column)

    if len(cols_ref) <= 1:
        return Xtrain

    dict_sparse, dict_dense = {}, {}
    for cols_groupname in cols_ref:
        assert cols_groupname in cols_type_received, "Error missing colgroup in config data_pars[cols_model_type] "

        if cols_groupname == "cols_sparse":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                m_bucket = min(500, int(Xtrain[coli].nunique()))
                dict_sparse[coli] = categorical_column_with_hash_bucket(
                    coli, hash_bucket_size=m_bucket)

        if cols_groupname == "cols_dense":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                dict_dense[coli] = numeric_column(coli)

        if cols_groupname == "cols_cross":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                m_bucketi = min(500, int(Xtrain[coli[0]].nunique()))
                m_bucketj = min(500, int(Xtrain[coli[1]].nunique()))
                # crossed_column takes a list of keys, not separate positional args
                dict_sparse[coli[0] + "-" + coli[1]] = crossed_column(
                    [coli[0], coli[1]], m_bucketi * m_bucketj)

        if cols_groupname == "cols_discretize":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                # the original passed the builtins min/max to linspace; the
                # column's own range is presumably what was intended
                bucket_list = np.linspace(Xtrain[coli].min(),
                                          Xtrain[coli].max(), 100).tolist()
                dict_sparse[coli + "_bin"] = bucketized_column(
                    numeric_column(coli), bucket_list)

    ### Embed the categorical columns first: embedding_column needs a
    ### categorical column, not an indicator column
    dict_embed = {
        'em_{}'.format(colname): embedding_column(col, 10)
        for colname, col in dict_sparse.items()
    }

    #### one-hot encode the sparse columns
    dict_sparse = {
        colname: indicator_column(col)
        for colname, col in dict_sparse.items()
    }

    dict_dnn = {**dict_embed, **dict_dense}
    dict_linear = {**dict_sparse, **dict_dense}

    return (dict_linear, dict_dnn)
Example 22

animal_type = feature_column.categorical_column_with_vocabulary_list('Type', ['Cat', 'Dog'])
animal_type_one_hot = feature_column.indicator_column(animal_type)
demo(animal_type_one_hot)
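
# demo is a helper not defined in this snippet; a minimal sketch of it, following
# the TensorFlow feature-columns tutorial pattern (example_batch is assumed to be
# one batch of features taken from the dataset):
def demo(feature_col):
    feature_layer = tf.keras.layers.DenseFeatures(feature_col)
    print(feature_layer(example_batch).numpy())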

# Notice the input to the embedding column is the categorical column we previously created
breed1 = feature_column.categorical_column_with_vocabulary_list('Breed1', dataframe.Breed1.unique())
breed1_embedding = feature_column.embedding_column(breed1, dimension = 8)
demo(breed1_embedding)

# Hashed feature columns
breed1_hashed = feature_column.categorical_column_with_hash_bucket('Breed1', hash_bucket_size = 10)
demo(feature_column.indicator_column(breed1_hashed))


# Crossed feature columns
crossed_feature = feature_column.crossed_column([age_buckets, animal_type], hash_bucket_size = 10)
demo(feature_column.indicator_column(crossed_feature))
# The line above dies here... ugh... OverflowError: Python int too large to convert to C long
# How do I fix this...

# Choose which columns to use
feature_columns = []

# numeric cols
for header in ['PhotoAmt', 'Fee', 'Age']:
    feature_columns.append(feature_column.numeric_column(header))

# bucketized cols
age = feature_column.numeric_column('Age')
age_buckets = feature_column.bucketized_column(age, boundaries = [1, 2, 3, 4, 5])
feature_columns.append(age_buckets)
Example 23
# sex_fare_cross = feature_column.crossed_column([sex, fare_buckets], hash_bucket_size=1000)
# feature_columns.append(feature_column.indicator_column(sex_fare_cross))

# fare_pclass_cross = feature_column.crossed_column([fare_buckets, Pclass], hash_bucket_size=1000)
# feature_columns.append(feature_column.indicator_column(fare_pclass_cross))

# embarked_fare_cross = feature_column.crossed_column([fare_buckets, embarked], hash_bucket_size=100)
# feature_columns.append(feature_column.indicator_column(embarked_fare_cross))

# age_sib_cross = feature_column.crossed_column([age_buckets, sib_buckets], hash_bucket_size=1000)
# feature_columns.append(feature_column.indicator_column(age_sib_cross))

# age_parch_cross = feature_column.crossed_column([age_buckets, parch_buckets], hash_bucket_size=1000)
# feature_columns.append(feature_column.indicator_column(age_parch_cross))

sex_parch_cross = feature_column.crossed_column([sex, parch_buckets], hash_bucket_size=1000)
feature_columns.append(feature_column.indicator_column(sex_parch_cross))

# create feature layer
feature_layer = tf.keras.layers.DenseFeatures(feature_columns)

# %%
# Train Model
# Training settings
BATCH_SIZE = 100
EPOCS = 50
LEARNING_RATE = 0.0001
L2 = 1e-8
train_ds = df_to_dataset(train, True, BATCH_SIZE, label)
test_ds = df_to_dataset(test, False, BATCH_SIZE, label)
submission_ds = df_to_dataset(submission_df, False, BATCH_SIZE, has_label=False)
Example 24
                                                              850])
feature_columns_container.append(fico_num)


institutions = feature_column.categorical_column_with_vocabulary_list(
        'institutionName', [
            'Bank of America', 'Toronto Dominion Bank', 'Citizens Bank', 'Webster Bank',
            'CHASE Bank', 'Citigroup', 'Capital One', 'HSBC Bank USA',
            'State Street Corporation', 'MUFG Union Bank', 'Wells Fargo & Co.', 'Barclays',
            'New York Community Bank', 'CIT Group', 'Santander Bank',
            'Royal Bank of Scotland', 'First Rand Bank', 'Budapest Bank'
            ])
institutions_pos = feature_column.indicator_column(institutions)
feature_columns_container.append(institutions_pos)

crossed_feat = feature_column.crossed_column([age, fico_num], hash_bucket_size = 1000)
crossed_feat = feature_column.indicator_column(crossed_feat)
feature_columns_container.append(crossed_feat)

###########EXAMPLES#######
#numeric column
#age = feature_column.numeric_column("age")

#categorical column with vocabulary list
#thal = feature_column.categorical_column_with_vocabulary_list(
#      'thal', ['fixed', 'normal', 'reversible'])

#bucketized column
#age_buckets = feature_column.bucketized_column(
#   age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
Example 25
def get_dataset_tuple_keras(Xtrain, cols_type_received, cols_ref, **kw):
    """
       Create sparse data struccture from dataframe data  to Feed Keras
    https://github.com/GoogleCloudPlatform/data-science-on-gcp/blob/master/09_cloudml/flights_model_tf2.ipynb
    :return:
    """
    from tensorflow.feature_column import (categorical_column_with_hash_bucket,
                                           numeric_column, embedding_column,
                                           bucketized_column, crossed_column,
                                           indicator_column)

    if len(cols_ref) <= 1:
        return Xtrain

    dict_sparse, dict_dense = {}, {}
    for cols_groupname in cols_ref:
        assert cols_groupname in cols_type_received, "Error missing colgroup in config data_pars[cols_model_type] "

        if cols_groupname == "cols_sparse":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                m_bucket = min(500, int(Xtrain[coli].nunique()))
                dict_sparse[coli] = categorical_column_with_hash_bucket(
                    coli, hash_bucket_size=m_bucket)

        if cols_groupname == "cols_dense":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                dict_dense[coli] = numeric_column(coli)

        if cols_groupname == "cols_cross":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                m_bucketi = min(500, int(Xtrain[coli[0]].nunique()))
                m_bucketj = min(500, int(Xtrain[coli[1]].nunique()))
                # crossed_column takes a list of keys, not separate positional args
                dict_sparse[coli[0] + "-" + coli[1]] = crossed_column(
                    [coli[0], coli[1]], m_bucketi * m_bucketj)

        if cols_groupname == "cols_discretize":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                # the original passed the builtins min/max to linspace; the
                # column's own range is presumably what was intended
                bucket_list = np.linspace(Xtrain[coli].min(),
                                          Xtrain[coli].max(), 100).tolist()
                dict_sparse[coli + "_bin"] = bucketized_column(
                    numeric_column(coli), bucket_list)

    ### Embed the categorical columns first: embedding_column needs a
    ### categorical column, not an indicator column
    dict_embed = {
        'em_{}'.format(colname): embedding_column(col, 10)
        for colname, col in dict_sparse.items()
    }

    #### one-hot encode the sparse columns
    dict_sparse = {
        colname: indicator_column(col)
        for colname, col in dict_sparse.items()
    }
    dict_dense2 = {**dict_dense, **dict_embed}

    X_tuple = (dict_sparse, dict_dense, dict_dense2)
    return X_tuple


# Separate snippet (unreachable after the return above); wrapped in a hypothetical
# function so it parses. `colnumeric` is assumed to be the list of numeric column names.
def make_flights_wide_deep(colnumeric):
    import tensorflow as tf
    import numpy as np

    NBUCKETS = 10

    real = {
        colname: tf.feature_column.numeric_column(colname)
        for colname in colnumeric
    }

    inputs = {
        colname: tf.keras.layers.Input(name=colname, shape=(), dtype='float32')
        for colname in real.keys()
    }

    sparse = {
        'carrier':
        tf.feature_column.categorical_column_with_vocabulary_list(
            'carrier',
            vocabulary_list='AS,VX,F9,UA,US,WN,HA,EV,MQ,DL,OO,B6,NK,AA'.split(
                ',')),
        'origin':
        tf.feature_column.categorical_column_with_hash_bucket(
            'origin', hash_bucket_size=1000),
        'dest':
        tf.feature_column.categorical_column_with_hash_bucket(
            'dest', hash_bucket_size=1000)
    }

    inputs.update({
        colname: tf.keras.layers.Input(name=colname, shape=(), dtype='string')
        for colname in sparse.keys()
    })

    latbuckets = np.linspace(20.0, 50.0, NBUCKETS).tolist()  # USA
    lonbuckets = np.linspace(-120.0, -70.0, NBUCKETS).tolist()  # USA
    disc = {}
    disc.update({
        'd_{}'.format(key):
        tf.feature_column.bucketized_column(real[key], latbuckets)
        for key in ['dep_lat', 'arr_lat']
    })
    disc.update({
        'd_{}'.format(key):
        tf.feature_column.bucketized_column(real[key], lonbuckets)
        for key in ['dep_lon', 'arr_lon']
    })

    # cross columns that make sense in combination
    sparse['dep_loc'] = tf.feature_column.crossed_column(
        [disc['d_dep_lat'], disc['d_dep_lon']], NBUCKETS * NBUCKETS)
    sparse['arr_loc'] = tf.feature_column.crossed_column(
        [disc['d_arr_lat'], disc['d_arr_lon']], NBUCKETS * NBUCKETS)
    sparse['dep_arr'] = tf.feature_column.crossed_column(
        [sparse['dep_loc'], sparse['arr_loc']], NBUCKETS**4)
    #sparse['ori_dest'] = tf.feature_column.crossed_column(['origin', 'dest'], hash_bucket_size=1000)

    # embed all the sparse columns
    embed = {
        'embed_{}'.format(colname):
        tf.feature_column.embedding_column(col, 10)
        for colname, col in sparse.items()
    }
    real.update(embed)

    # one-hot encode the sparse columns
    sparse = {
        colname: tf.feature_column.indicator_column(col)
        for colname, col in sparse.items()
    }

    def wide_and_deep_classifier(inputs, linear_feature_columns,
                                 dnn_feature_columns, dnn_hidden_units):
        deep = tf.keras.layers.DenseFeatures(dnn_feature_columns,
                                             name='deep_inputs')(inputs)
        layers = [int(x) for x in dnn_hidden_units.split(',')]
        for layerno, numnodes in enumerate(layers):
            deep = tf.keras.layers.Dense(numnodes,
                                         activation='relu',
                                         name='dnn_{}'.format(layerno +
                                                              1))(deep)
        wide = tf.keras.layers.DenseFeatures(linear_feature_columns,
                                             name='wide_inputs')(inputs)
        both = tf.keras.layers.concatenate([deep, wide], name='both')
        output = tf.keras.layers.Dense(1, activation='sigmoid',
                                       name='pred')(both)
        model = tf.keras.Model(inputs, output)
        model.compile(optimizer='adam',
                      loss='binary_crossentropy',
                      metrics=['accuracy'])
        return model

    DNN_HIDDEN_UNITS = '10'  # must be a string: it gets split(',') above
    model = wide_and_deep_classifier(inputs,
                                     linear_feature_columns=sparse.values(),
                                     dnn_feature_columns=real.values(),
                                     dnn_hidden_units=DNN_HIDDEN_UNITS)
    tf.keras.utils.plot_model(model,
                              'flights_model.png',
                              show_shapes=False,
                              rankdir='LR')
    X_tuple = (sparse, real, real)
    return X_tuple
Example 26
def tf_data_create_sparse(cols_type_received: dict = {
    'cols_sparse': ['col1', 'col2'],
    'cols_dense': ['cola', 'colb']
},
                          cols_ref: list = ['cols_sparse', 'cols_dense'],
                          Xtrain: pd.DataFrame = None,
                          **kw):
    """

       Create sparse data struccture in KERAS  To plug with MODEL:
       No data, just virtual data
    https://github.com/GoogleCloudPlatform/data-science-on-gcp/blob/master/09_cloudml/flights_model_tf2.ipynb

    :return:
    """
    import tensorflow
    from tensorflow.feature_column import (categorical_column_with_hash_bucket,
                                           numeric_column, embedding_column,
                                           bucketized_column, crossed_column,
                                           indicator_column)

    ### Unique values :
    col_unique = {}

    if Xtrain is not None:
        for coli in cols_type_received.get('cols_sparse', []):
            col_unique[coli] = int(Xtrain[coli].nunique())

    dict_cat_sparse, dict_dense = {}, {}
    for cols_groupname in cols_ref:
        assert cols_groupname in cols_type_received, "Error missing colgroup in config data_pars[cols_model_type] "

        if cols_groupname == "cols_sparse":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                m_bucket = min(500, col_unique.get(coli, 500))
                dict_cat_sparse[coli] = categorical_column_with_hash_bucket(
                    coli, hash_bucket_size=m_bucket)

        if cols_groupname == "cols_dense":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                dict_dense[coli] = numeric_column(coli)

        if cols_groupname == "cols_cross":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                m_bucketi = min(500, col_unique.get(coli[0], 500))
                m_bucketj = min(500, col_unique.get(coli[1], 500))
                # crossed_column takes a list of keys, not separate positional args
                dict_cat_sparse[coli[0] + "-" + coli[1]] = crossed_column(
                    [coli[0], coli[1]], m_bucketi * m_bucketj)

        if cols_groupname == "cols_discretize":
            col_list = cols_type_received[cols_groupname]
            for coli in col_list:
                # the original passed the builtins min/max to linspace; the
                # column's own range is presumably what was intended
                lo, hi = ((Xtrain[coli].min(), Xtrain[coli].max())
                          if Xtrain is not None else (0.0, 1.0))
                bucket_list = np.linspace(lo, hi, 100).tolist()
                dict_cat_sparse[coli + "_bin"] = bucketized_column(
                    numeric_column(coli), bucket_list)

    ### Embed the categorical columns first: embedding_column needs a
    ### categorical column, not an indicator column
    dict_cat_embed = {
        'em_{}'.format(colname): embedding_column(col, 10)
        for colname, col in dict_cat_sparse.items()
    }

    #### one-hot encode the sparse columns
    dict_cat_sparse = {
        colname: indicator_column(col)
        for colname, col in dict_cat_sparse.items()
    }

    #### To customize:
    # dict_dnn    = {**dict_cat_embed,  **dict_dense}
    # dict_linear = {**dict_cat_sparse, **dict_dense}

    return dict_cat_sparse, dict_cat_embed, dict_dense
Example 27
def _combination():
    education_occupation = fc.crossed_column(['education', 'occupation'], 300)
    education_occupation = fc.indicator_column(education_occupation)
    return [education_occupation]
Example 28
def official_census_feature_columns_config_demo():
    # categorical_column
    gender = fc.categorical_column_with_vocabulary_list(
        "gender", ["Female", "Male"])
    education = fc.categorical_column_with_vocabulary_list(
        "education", [
            "Bachelors", "HS-grad", "11th", "Masters", "9th", "Some-college",
            "Assoc-acdm", "Assoc-voc", "7th-8th", "Doctorate", "Prof-school",
            "5th-6th", "10th", "1st-4th", "Preschool", "12th"
        ])
    marital_status = fc.categorical_column_with_vocabulary_list(
        "marital_status", [
            "Married-civ-spouse", "Divorced", "Married-spouse-absent",
            "Never-married", "Separated", "Married-AF-spouse", "Widowed"
        ])
    relationship = fc.categorical_column_with_vocabulary_list(
        "relationship", [
            "Husband", "Not-in-family", "Wife", "Own-child", "Unmarried",
            "Other-relative"
        ])
    workclass = fc.categorical_column_with_vocabulary_list(
        "workclass", [
            "Self-emp-not-inc", "Private", "State-gov", "Federal-gov",
            "Local-gov", "?", "Self-emp-inc", "Without-pay", "Never-worked"
        ])

    # To show an example of hashing:
    native_country = fc.categorical_column_with_hash_bucket(
        "native_country", hash_bucket_size=1000)
    occupation = fc.categorical_column_with_hash_bucket("occupation",
                                                        hash_bucket_size=1000)

    # Continuous feature columns.
    age = fc.numeric_column("age")
    education_num = fc.numeric_column("education_num")
    capital_gain = fc.numeric_column("capital_gain")
    capital_loss = fc.numeric_column("capital_loss")
    hours_per_week = fc.numeric_column("hours_per_week")

    # bucketized transformations.
    age_buckets = fc.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

    # Wide columns and deep columns.
    base_columns = [
        gender, education, marital_status, relationship, workclass, occupation,
        native_country, age_buckets
    ]
    crossed_columns = [
        fc.crossed_column(['education', 'occupation'], hash_bucket_size=1000),
        fc.crossed_column([age_buckets, 'education', 'occupation'],
                          hash_bucket_size=1000),
        fc.crossed_column(['native_country', 'occupation'],
                          hash_bucket_size=1000)
    ]
    feature_columns = [
        fc.indicator_column(workclass),
        fc.indicator_column(education),
        fc.indicator_column(gender),
        fc.indicator_column(relationship),
        fc.embedding_column(native_country, dimension=32),
        fc.embedding_column(occupation, dimension=32),
        age,
        education_num,
        capital_gain,
        capital_loss,
        hours_per_week,
    ]
    return feature_columns, base_columns, crossed_columns
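
These three column sets map directly onto the canned wide-and-deep estimator; a
sketch (model_dir and the hidden-unit sizes are placeholders):

feature_columns, base_columns, crossed_columns = \
    official_census_feature_columns_config_demo()
estimator = tf.estimator.DNNLinearCombinedClassifier(
    model_dir='/tmp/census_model',
    linear_feature_columns=base_columns + crossed_columns,
    dnn_feature_columns=feature_columns,
    dnn_hidden_units=[100, 50])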
Example 29
# ________________________________MODEL___________________________________

# ______wide model start ________

# distance bucket input
distance_bucket_input = Input(shape=(1,), dtype='int64', name='distance_bucket')
unique_distance_buckets = list(range(10))
distance_bucket_column = feature_column.categorical_column_with_vocabulary_list('distance_bucket_input', unique_distance_buckets)

# is city search input
is_city_search_input = Input(shape=(1,), dtype='int64', name='is_city_search')
unique_is_city_search = [0,1]
is_city_search_column = feature_column.categorical_column_with_vocabulary_list('is_city_search_input', unique_is_city_search)

# interaction features between distance bucket and city search input
distance_city_cross_feature = feature_column.crossed_column([distance_bucket_column, is_city_search_column], hash_bucket_size=20,hash_key=42)
distance_city_cross_indicator_feature = feature_column.indicator_column(distance_city_cross_feature)
distance_city_cross_dense = layers.DenseFeatures(distance_city_cross_indicator_feature)({'distance_bucket_input':distance_bucket_input,
                                                                                        'is_city_search_input':is_city_search_input})

# create input for hotel type
hotel_type_input = Input(shape=(1,), dtype='int64', name='hotel_type')
hotel_type = feature_column.categorical_column_with_vocabulary_list(
         'hotel_type', unique_hotel_type)

# create hotel type click stream inputs and interaction with candidate hotel type
clicked_hotel_type_input_sparse = {}
crossed_hotel_type_dense = {}
for i in CLICKED_HOTEL_TYPE_COLUMNS:
    clicked_hotel_type_input_sparse[i] = Input(shape=(1,), dtype='int64', name=i)
    categorical_col = feature_column.categorical_column_with_vocabulary_list(
Example 30
def build_census_wide_columns():
    n_range = get_census_numeric_feat_range()
    base_columns = [
        fc.bucketized_column(fc.numeric_column('age'),
                             boundaries=list(
                                 np.linspace(n_range['age'][0],
                                             n_range['age'][1], 1000))),
        fc.bucketized_column(fc.numeric_column('education_num'),
                             boundaries=list(
                                 np.linspace(n_range['education_num'][0],
                                             n_range['education_num'][1],
                                             1000))),
        fc.bucketized_column(fc.numeric_column('capital_gain'),
                             boundaries=list(
                                 np.linspace(n_range['capital_gain'][0],
                                             n_range['capital_gain'][1],
                                             1000))),
        fc.bucketized_column(fc.numeric_column('capital_loss'),
                             boundaries=list(
                                 np.linspace(n_range['capital_loss'][0],
                                             n_range['capital_loss'][1],
                                             1000))),
        fc.bucketized_column(fc.numeric_column('hours_per_week'),
                             boundaries=list(
                                 np.linspace(n_range['hours_per_week'][0],
                                             n_range['hours_per_week'][1],
                                             1000))),
        fc.indicator_column(
            fc.categorical_column_with_hash_bucket('gender',
                                                   hash_bucket_size=1000)),
        fc.indicator_column(
            fc.categorical_column_with_hash_bucket('education',
                                                   hash_bucket_size=1000)),
        fc.indicator_column(
            fc.categorical_column_with_hash_bucket('marital_status',
                                                   hash_bucket_size=1000)),
        fc.indicator_column(
            fc.categorical_column_with_hash_bucket('relationship',
                                                   hash_bucket_size=1000)),
        fc.indicator_column(
            fc.categorical_column_with_hash_bucket('workclass',
                                                   hash_bucket_size=1000)),
        fc.indicator_column(
            fc.categorical_column_with_hash_bucket('native_country',
                                                   hash_bucket_size=1000)),
        fc.indicator_column(
            fc.categorical_column_with_hash_bucket('occupation',
                                                   hash_bucket_size=1000))
    ]
    age_buckets = fc.bucketized_column(
        fc.numeric_column("age"),
        boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    cross_columns = [
        fc.indicator_column(
            fc.crossed_column(["education", "occupation"],
                              hash_bucket_size=1000)),
        fc.indicator_column(
            fc.crossed_column(["native_country", "occupation"],
                              hash_bucket_size=1000)),
        fc.indicator_column(
            fc.crossed_column([age_buckets, "education", "occupation"],
                              hash_bucket_size=1000))
    ]
    feature_columns = base_columns + cross_columns
    feat_field_size = len(feature_columns)
    return feature_columns, feat_field_size
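
A minimal consumer of this function (a sketch; the single sigmoid unit makes it a
plain logistic-regression wide model):

feature_columns, _ = build_census_wide_columns()
model = tf.keras.Sequential([
    tf.keras.layers.DenseFeatures(feature_columns),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])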