Example #1
0
def build_estimator(model_dir, nbuckets, hidden_units):
  """Assemble a wide-and-deep regressor from the columns in INPUT_COLUMNS.

  The plat/dlat and plon/dlon columns are bucketized on evenly spaced grids,
  crossed into per-location and location-pair features, and combined with a
  dayofweek x hourofday cross.  Crosses and sparse columns feed the linear
  part; embeddings and the raw real-valued columns feed the DNN part.

  Args:
    model_dir: forwarded to the estimator as its ``model_dir``.
    nbuckets: number of boundary points on each lat/lon grid; also used to
      size the hash buckets of the location crosses.
    hidden_units: DNN layer sizes; a falsy value selects [128, 32, 4].

  Returns:
    A tf.contrib.learn.DNNLinearCombinedRegressor.
  """
  (dayofweek, hourofday, latdiff, londiff, euclidean,
   plon, plat, dlon, dlat, pcount) = INPUT_COLUMNS

  # One boundary grid per axis, shared by the "p" and "d" columns.
  lat_edges = np.linspace(38.0, 42.0, nbuckets).tolist()
  lon_edges = np.linspace(-76.0, -72.0, nbuckets).tolist()
  plat_b, dlat_b = [
      layers.bucketized_column(col, lat_edges) for col in (plat, dlat)
  ]
  plon_b, dlon_b = [
      layers.bucketized_column(col, lon_edges) for col in (plon, dlon)
  ]

  # Crossed features: lat x lon per location, then location x location.
  cells = nbuckets * nbuckets
  p_cell = layers.crossed_column([plat_b, plon_b], cells)
  d_cell = layers.crossed_column([dlat_b, dlon_b], cells)
  cell_pair = layers.crossed_column([p_cell, d_cell], nbuckets ** 4)
  weekly_hour = layers.crossed_column([dayofweek, hourofday], 24 * 7)

  # Linear ("wide") side: crosses, the sparse time columns, and pcount.
  linear_side = [
      d_cell, p_cell, cell_pair, weekly_hour,
      dayofweek, hourofday,
      pcount,
  ]

  # DNN ("deep") side: 10-dimensional embeddings of the two big crosses
  # followed by the raw real-valued columns.
  dnn_side = [
      layers.embedding_column(cell_pair, 10),
      layers.embedding_column(weekly_hour, 10),
      plat, plon, dlat, dlon,
      latdiff, londiff, euclidean,
  ]

  layer_sizes = hidden_units if hidden_units else [128, 32, 4]
  return tf.contrib.learn.DNNLinearCombinedRegressor(
      model_dir=model_dir,
      linear_feature_columns=linear_side,
      dnn_feature_columns=dnn_side,
      dnn_hidden_units=layer_sizes)
Example #2
0
def wide_and_deep(output_dir,
                  nbuckets=5,
                  hidden_units='64,16,4',
                  learning_rate=0.01):
    """Build a wide-and-deep classifier from the columns of get_features().

    Args:
        output_dir: forwarded to the estimator as its ``model_dir``.
        nbuckets: number of boundary points used to discretise each lat/lon
            column; also sizes the hash buckets of the location crosses.
        hidden_units: comma-separated DNN layer sizes, e.g. ``'64,16,4'``.
        learning_rate: accepted for interface compatibility; this function
            does not read it.

    Returns:
        A tflearn.DNNLinearCombinedClassifier whose head thresholds have been
        overridden to ``[0.7]``.
    """
    real, sparse = get_features()

    hidden_units = [int(width) for width in hidden_units.split(',')]
    print(".........................", hidden_units)

    # Boundary grids for the discretised coordinates (USA extent).
    lat_edges = np.linspace(20.0, 50.0, nbuckets).tolist()
    lon_edges = np.linspace(-120.0, -70.0, nbuckets).tolist()

    # Discretised copy of every lat/lon column, keyed as 'd_<name>'.
    disc = {}
    for edges, names in ((lat_edges, ('dep_lat', 'arr_lat')),
                         (lon_edges, ('dep_lon', 'arr_lon'))):
        for name in names:
            disc['d_{}'.format(name)] = tflayers.bucketized_column(
                real[name], edges)

    # New crossed features are added to the sparse set in this order.
    grid_cells = nbuckets * nbuckets
    sparse['dep_loc'] = tflayers.crossed_column(
        [disc['d_dep_lat'], disc['d_dep_lon']], grid_cells)
    sparse['arr_loc'] = tflayers.crossed_column(
        [disc['d_arr_lat'], disc['d_arr_lon']], grid_cells)
    sparse['dep_arr'] = tflayers.crossed_column(
        [sparse['dep_loc'], sparse['arr_loc']], nbuckets**4)
    sparse['ori_dest'] = tflayers.crossed_column(
        [sparse['origin'], sparse['dest']], hash_bucket_size=1000)

    # Every sparse column also reaches the DNN through create_embed().
    for colname, col in sparse.items():
        real[colname] = create_embed(col)

    estimator = tflearn.DNNLinearCombinedClassifier(
        model_dir=output_dir,
        linear_feature_columns=sparse.values(),
        dnn_feature_columns=real.values(),
        dnn_hidden_units=hidden_units)

    # Overrides a private attribute of the estimator's head.
    estimator.params["head"]._thresholds = [0.7]

    return estimator
Example #3
0
def wide_and_deep_model(output_dir, nbuckets=5, hidden_units='64,32', learning_rate=0.01):
    """Build a wide-and-deep classifier from the columns of get_features().

    Args:
        output_dir: forwarded to the estimator as its ``model_dir``.
        nbuckets: number of boundary points used to discretise each lat/lon
            column; also sizes the hash buckets of the location crosses.
        hidden_units: layer-size specification handed to
            ``parse_hidden_units``.
        learning_rate: accepted for interface compatibility; not read here
            (the estimator is built without explicit optimizers).

    Returns:
        A tflearn.DNNLinearCombinedClassifier whose head thresholds have been
        overridden to ``[0.7]``.
    """
    real, sparse = get_features()

    # Discretising lat/lon yields "air traffic corridors" (USA extent).
    lat_grid = np.linspace(20.0, 50.0, nbuckets).tolist()
    lon_grid = np.linspace(-120.0, -70.0, nbuckets).tolist()

    disc = {}
    for grid, column_names in ((lat_grid, ['dep_lat', 'arr_lat']),
                               (lon_grid, ['dep_lon', 'arr_lon'])):
        for column_name in column_names:
            bucketized = tflayers.bucketized_column(real[column_name], grid)
            disc['d_{}'.format(column_name)] = bucketized

    # Crosses that make sense in combination; insertion order is kept.
    location_buckets = nbuckets*nbuckets
    sparse['dep_loc'] = tflayers.crossed_column(
        [disc['d_dep_lat'], disc['d_dep_lon']], location_buckets)
    sparse['arr_loc'] = tflayers.crossed_column(
        [disc['d_arr_lat'], disc['d_arr_lon']], location_buckets)
    sparse['dep_arr'] = tflayers.crossed_column(
        [sparse['dep_loc'], sparse['arr_loc']], nbuckets ** 4)
    sparse['ori_dest'] = tflayers.crossed_column(
        [sparse['origin'], sparse['dest']], hash_bucket_size=1000)

    # Embed every sparse column and add it to the DNN inputs.
    for colname, col in sparse.items():
        real[colname] = create_embed(col)

    estimator = tflearn.DNNLinearCombinedClassifier(
        model_dir=output_dir,
        linear_feature_columns=sparse.values(),
        dnn_feature_columns=real.values(),
        dnn_hidden_units=parse_hidden_units(hidden_units))

    # FIXME: hack -- pokes a private attribute of the estimator's head.
    estimator.params["head"]._thresholds = [0.7]
    return estimator
Example #4
0
def build_estimator(model_dir, embedding_size=8, hidden_units=None):
    """Build a wide-and-deep classifier over the columns in INPUT_COLUMNS.

    Crossed and categorical columns (plus bucketized age) feed the linear
    part; embeddings of the categorical columns and the raw continuous
    columns feed the DNN part.

    Args:
        model_dir: forwarded to the estimator as its ``model_dir``.
        embedding_size: dimension of every embedding column given to the DNN.
        hidden_units: DNN layer sizes; a falsy value selects
            [100, 70, 50, 25].

    Returns:
        A tf.contrib.learn.DNNLinearCombinedClassifier.
    """
    (gender, race, education, marital_status, relationship, workclass,
     occupation, native_country, age, education_num, capital_gain,
     capital_loss, hours_per_week) = INPUT_COLUMNS

    # Reused transformation: age as a categorical (bucketized) column.
    age_buckets = layers.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

    # Wide columns: feature crosses first, then the base categorical columns.
    wide_columns = [
        layers.crossed_column([education, occupation],
                              hash_bucket_size=int(1e4)),
        layers.crossed_column([age_buckets, race, occupation],
                              hash_bucket_size=int(1e6)),
        layers.crossed_column([native_country, occupation],
                              hash_bucket_size=int(1e4)),
        gender,
        native_country,
        education,
        occupation,
        workclass,
        marital_status,
        relationship,
        age_buckets,
    ]

    # Deep columns: one embedding per categorical column, then the
    # continuous columns unchanged.
    deep_columns = [
        layers.embedding_column(workclass, dimension=embedding_size),
        layers.embedding_column(education, dimension=embedding_size),
        layers.embedding_column(marital_status, dimension=embedding_size),
        layers.embedding_column(gender, dimension=embedding_size),
        layers.embedding_column(relationship, dimension=embedding_size),
        layers.embedding_column(race, dimension=embedding_size),
        layers.embedding_column(native_country, dimension=embedding_size),
        layers.embedding_column(occupation, dimension=embedding_size),
        age,
        education_num,
        capital_gain,
        capital_loss,
        hours_per_week,
    ]

    return tf.contrib.learn.DNNLinearCombinedClassifier(
        model_dir=model_dir,
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=hidden_units or [100, 70, 50, 25])
def build_estimator(model_dir, model_type):
    """Create a linear, DNN, or combined linear+DNN classifier.

    Args:
        model_dir: forwarded to the estimator as its ``model_dir``.
        model_type: ``"wide"`` selects a LinearClassifier, ``"deep"`` a
            DNNClassifier; any other value selects a
            DNNLinearCombinedClassifier.

    Returns:
        The estimator selected by ``model_type``.
    """

    def hashed(name, buckets):
        # Sparse column whose values are hashed into `buckets` buckets.
        return layers.sparse_column_with_hash_bucket(
            column_name=name, hash_bucket_size=buckets)

    # Base sparse features.
    gender = layers.sparse_column_with_keys(
        column_name='gender', keys=['female', 'male'])
    education = hashed('education', 1000)
    relationship = hashed('relationship', 100)
    workclass = hashed('workclass', 100)
    occupation = hashed('occupation', 1000)
    native_country = hashed('native_country', 1000)

    # Base continuous features.
    age, education_num, capital_gain, capital_loss, hours_per_week = [
        layers.real_valued_column(column_name=name)
        for name in ('age', 'education_num', 'capital_gain', 'capital_loss',
                     'hours_per_week')
    ]

    # Bucketization turns the continuous age into a categorical feature.
    age_bucket = layers.bucketized_column(
        source_column=age,
        boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

    # Features for the wide (linear) model: categorical columns only,
    # followed by their crosses.
    wide_columns = [
        gender, native_country, education, relationship, workclass,
        occupation, age_bucket,
        layers.crossed_column(columns=[education, occupation],
                              hash_bucket_size=int(1e4)),
        layers.crossed_column(columns=[age_bucket, education, occupation],
                              hash_bucket_size=int(1e6)),
        layers.crossed_column(columns=[native_country, occupation],
                              hash_bucket_size=int(1e4)),
    ]

    # Features for the deep model: 8-dimensional embeddings of the sparse
    # columns, then the continuous columns as they are.
    embedded = [
        layers.embedding_column(column, dimension=8)
        for column in (workclass, education, gender, relationship,
                       native_country, occupation)
    ]
    deep_columns = embedded + [
        age, education_num, capital_gain, capital_loss, hours_per_week
    ]

    if model_type == "wide":
        return learn.LinearClassifier(feature_columns=wide_columns,
                                      model_dir=model_dir)
    if model_type == "deep":
        return learn.DNNClassifier(feature_columns=deep_columns,
                                   model_dir=model_dir,
                                   hidden_units=[100, 50])
    return learn.DNNLinearCombinedClassifier(
        model_dir=model_dir,
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=[256, 128, 64],
        dnn_activation_fn=tf.nn.relu)
# Notebook-style exploration of feature columns built from `df`.
# The bare expressions (`type(...)`, `.key`, `.size`) only evaluate a value
# and discard it; they are inspection steps, not part of any pipeline.
df.info()

# Real-valued column for the 'id' feature.
# NOTE(review): the module-level name `id` shadows the Python builtin `id`.
id = layers.real_valued_column('id')
type(id)
id.key

# Real-valued column for the 'fare' feature.
fare = layers.real_valued_column('fare')
type(fare)
fare.key

# Names of the continuous features handled below.
cont_features = ['id', 'fare']

# Build one real-valued column per continuous feature in a single pass.
cont_feature_cols = [layers.real_valued_column(k) for k in cont_features]

# Bucketized view of `fare` with boundaries at 15 and 30.
fare_buckets = layers.bucketized_column(fare, boundaries=[15, 30])
type(fare_buckets)
fare_buckets.key

# Turn continuous feature data into a constant tensor, inspecting each
# intermediate step on the way (df[['id']] -> .values -> tf.constant).
type(df[['id']])
df[['id']].size
type(df[['id']].values)
ct = tf.constant(df[['id']].values)
type(ct)

# Same conversion for every continuous feature: name -> constant tensor.
cont_features_tensor = {k: tf.constant(df[k].values) for k in cont_features}
Example #7
0
from tempfile import mkdtemp

# Filesystem locations, all resolved relative to this source file.
PATH_TO_DIRECTORY_OF_THIS_FILE = dirname(realpath(__file__))
PATH_TO_DIRECTORY_OF_INPUT_DATA = PATH_TO_DIRECTORY_OF_THIS_FILE + "/data/input"
MODEL_DIR = PATH_TO_DIRECTORY_OF_THIS_FILE + "/classifier"

# Input schema: feature names by kind, plus the label column.
CATEGORICAL_COLUMNS = ["admin_level", "country_code", "edit_distance", "has_mpoly", "has_pcode", "is_country", "is_highest_population", "is_lowest_admin_level", "matches_topic"]
CONTINUOUS_COLUMNS = ["cluster_frequency", "country_rank", "median_distance", "population", "popularity"]
LABEL_COLUMN = "correct"
# All feature names in sorted order, with the label as the last entry.
COLUMNS = sorted(CATEGORICAL_COLUMNS + CONTINUOUS_COLUMNS) + [LABEL_COLUMN]
# Single-argument call form: valid on Python 3 and prints the same line as
# the former Python 2 print statement.
print("COLUMNS: %s" % (COLUMNS,))


# --- Categorical inputs with a fixed vocabulary -----------------------------
admin_level = sparse_column_with_keys(
    column_name="admin_level",
    # I've never seen admin 6, but you never know!
    keys=["None", "0", "1", "2", "3", "4", "5", "6"])
edit_distance = sparse_column_with_keys(
    column_name="edit_distance", keys=["0", "1", "2"])

# Boolean-like inputs, encoded as the strings "True" / "False".
has_pcode = sparse_column_with_keys(
    column_name="has_pcode", keys=["True", "False"])
has_mpoly = sparse_column_with_keys(
    column_name="has_mpoly", keys=["True", "False"])
is_country = sparse_column_with_keys(
    column_name="is_country", keys=["True", "False"])
is_lowest_admin_level = sparse_column_with_keys(
    column_name="is_lowest_admin_level", keys=["True", "False"])
is_highest_population = sparse_column_with_keys(
    column_name="is_highest_population", keys=["True", "False"])
matches_topic = sparse_column_with_keys(
    column_name="matches_topic", keys=["True", "False"])

# --- Categorical input hashed into a fixed number of buckets ----------------
country_code = sparse_column_with_hash_bucket(
    "country_code", hash_bucket_size=500)

# --- Continuous inputs ------------------------------------------------------
cluster_frequency = real_valued_column("cluster_frequency")
country_rank = real_valued_column("country_rank")
median_distance = real_valued_column("median_distance")
population = real_valued_column("population")
popularity = real_valued_column("popularity")

# --- Bucketized views of continuous inputs ----------------------------------
cluster_frequency_buckets = bucketized_column(
    cluster_frequency,
    boundaries=[0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1])
median_distance_buckets = bucketized_column(
    median_distance, boundaries=[10, 50, 100, 200, 300])
population_buckets = bucketized_column(
    population,
    boundaries=[0, 1, 100, 1000, 10000, 100000, 1000000, 10000000,
                100000000])

# --- Feature cross ----------------------------------------------------------
admin_level_x_median_distance = crossed_column(
    [admin_level, median_distance_buckets], hash_bucket_size=int(1e4))
Example #8
0
def build_estimator(model_dir, embedding_size=8, hidden_units=None):
    """Build a wide-and-deep classifier over the columns in INPUT_COLUMNS.

    The linear ("wide") part receives crosses of ``actividad`` with the other
    categorical columns plus those categorical columns themselves; the DNN
    ("deep") part receives embeddings of the categorical columns together
    with the ``anio``, ``mes`` and ``bueno`` columns.

    Background on wide-and-deep models:
    https://research.googleblog.com/2016/06/wide-deep-learning-better-together-with.html

    Args:
        model_dir: str, forwarded to the classifier as its ``model_dir``.
        embedding_size: int, dimension of each embedding column fed to the
            DNN.
        hidden_units: [int], DNN layer sizes; a falsy value selects
            [100, 70, 50, 25].

    Returns:
        A tf.contrib.learn.DNNLinearCombinedClassifier.
    """
    (actividad, anio, bueno, dia, lugar, mes, pais) = INPUT_COLUMNS

    # Bucketization gives a categorical view of `mes`.
    mes_bucket = layers.bucketized_column(mes, boundaries=list(range(1, 13)))

    # Linear side: `actividad` crossed with each other categorical column,
    # followed by the categorical columns on their own.
    actividad_crosses = [
        layers.crossed_column([actividad, partner], hash_bucket_size=int(1e4))
        for partner in (lugar, mes_bucket, dia, pais)
    ]
    linear_side = actividad_crosses + [actividad, dia, lugar, mes_bucket, pais]

    # DNN side: embeddings first, then the columns passed through unchanged.
    embedded = [
        layers.embedding_column(column, dimension=embedding_size)
        for column in (actividad, lugar, dia, pais)
    ]
    dnn_side = embedded + [anio, mes, bueno]

    layer_sizes = hidden_units if hidden_units else [100, 70, 50, 25]
    return tf.contrib.learn.DNNLinearCombinedClassifier(
        model_dir=model_dir,
        linear_feature_columns=linear_side,
        dnn_feature_columns=dnn_side,
        dnn_hidden_units=layer_sizes)
Example #9
0
def build_estimator(model_dir, nbuckets, hidden_units):
    """Create a wide-and-deep regressor from the columns in INPUT_COLUMNS.

    Latitude and longitude columns are bucketized, crossed into location and
    location-pair features, and joined by a dayofweek x hourofday cross.

    Args:
        model_dir: forwarded to the estimator as its ``model_dir``.
        nbuckets: number of boundary points per lat/lon grid; also sizes the
            hash buckets of the location crosses.
        hidden_units: DNN layer sizes; a falsy value selects [128, 32, 4].

    Returns:
        A tf.contrib.learn.DNNLinearCombinedRegressor.
    """

    def bucketize_pair(first, second, low, high):
        # Both columns of a pair share one evenly spaced boundary grid.
        grid = np.linspace(low, high, nbuckets).tolist()
        return (layers.bucketized_column(first, grid),
                layers.bucketized_column(second, grid))

    (dayofweek, hourofday, latdiff, londiff, euclidean, plon, plat, dlon, dlat,
     pcount) = INPUT_COLUMNS

    lat_p, lat_d = bucketize_pair(plat, dlat, 38.0, 42.0)
    lon_p, lon_d = bucketize_pair(plon, dlon, -76.0, -72.0)

    # Feature crosses.
    per_location = nbuckets * nbuckets
    loc_p = layers.crossed_column([lat_p, lon_p], per_location)
    loc_d = layers.crossed_column([lat_d, lon_d], per_location)
    loc_pair = layers.crossed_column([loc_p, loc_d], nbuckets**4)
    day_by_hour = layers.crossed_column([dayofweek, hourofday], 24 * 7)

    # Wide part: crosses, then sparse columns, then pcount (linear effect).
    wide = [loc_d, loc_p, loc_pair, day_by_hour]
    wide += [dayofweek, hourofday]
    wide.append(pcount)

    # Deep part: embeddings that "group" the crosses, then real-valued inputs.
    deep = [layers.embedding_column(crossed, 10)
            for crossed in (loc_pair, day_by_hour)]
    deep += [plat, plon, dlat, dlon, latdiff, londiff, euclidean]

    if not hidden_units:
        hidden_units = [128, 32, 4]
    return tf.contrib.learn.DNNLinearCombinedRegressor(
        model_dir=model_dir,
        linear_feature_columns=wide,
        dnn_feature_columns=deep,
        dnn_hidden_units=hidden_units)
    # NOTE(review): these lines follow the `return` above, so as written they
    # are unreachable. They look like the body of an input function whose
    # `def` line is missing -- presumably `travel_input_fn(df)`, which is
    # called further down; restore the header before relying on this code.
    # Counts and reports how often this code has run (`num_call` is defined
    # elsewhere -- TODO confirm it is declared `global` in the real function).
    num_call += 1
    print('input function was called %d times' % num_call)
    # Returns a (features, target) pair: a dict of constant tensors, one per
    # input column, and a constant tensor of 'avg_travel_time'.
    # NOTE(review): `.iloc` is positional in pandas; indexing it with column
    # labels looks wrong and `.loc` is probably intended -- confirm.
    return {
        'route_quality': tf.constant(df.iloc[:, 'route_quality']),
        'wind_direction': tf.constant(df.iloc[:, 'wind_direction']),
        'wind_speed': tf.constant(df.iloc[:, 'wind_speed']),
        'temperature': tf.constant(df.iloc[:, 'temperature']),
        'precipitation': tf.constant(df.iloc[:, 'precipitation']),
        'weekend': tf.constant(df.iloc[:, 'weekend']),
        'time_of_day': tf.constant(df.iloc[:, 'time_of_day'])
    }, tf.constant(df.iloc[:, 'avg_travel_time'])


# Numeric inputs, one real-valued column each.
route_quality = layers.real_valued_column('route_quality')
wind_speed = layers.real_valued_column('wind_speed')
temperature = layers.real_valued_column('temperature')
precipitation = layers.real_valued_column('precipitation')
weekend = layers.real_valued_column('weekend')
time_of_day = layers.real_valued_column('time_of_day')

# Wind direction reaches the model only in bucketized form, with boundaries
# every 45 units from 0 to 360.
wind_direction = layers.real_valued_column('wind_direction')
wind_direction_range = layers.bucketized_column(
    wind_direction, boundaries=list(range(0, 361, 45)))

# Linear regressor over all of the columns above.
regressor = learn.LinearRegressor(
    feature_columns=[
        route_quality,
        wind_direction_range,
        wind_speed,
        temperature,
        precipitation,
        weekend,
        time_of_day,
    ])


def travel_input_fn_training():
    """Call ``travel_input_fn`` on ``training_travel_df`` and return its result."""
    result = travel_input_fn(training_travel_df)
    return result

def build_estimator(model_dir, embedding_size=8, hidden_units=None):
  """Build a wide-and-deep classifier over the columns in INPUT_COLUMNS.

  Wide-and-deep models combine the output of a deep neural net, which sees
  embeddings and continuous inputs, with a linear model over categorical
  columns and their crosses.  Background:
  https://research.googleblog.com/2016/06/wide-deep-learning-better-together-with.html

  Only the dataset-specific column transformations are defined here; the
  model itself is the prebuilt DNNLinearCombinedClassifier.

  Args:
    model_dir: str, forwarded to the classifier as its ``model_dir``.
    embedding_size: int, dimension of each embedding column fed to the DNN.
    hidden_units: [int], DNN layer sizes; a falsy value selects
      [100, 70, 50, 25].

  Returns:
    A tf.contrib.learn.DNNLinearCombinedClassifier.
  """

  def cross(columns, buckets):
    # Crossed feature over `columns`, hashed into `buckets` buckets.
    return layers.crossed_column(columns, hash_bucket_size=buckets)

  (gender, race, education, marital_status, relationship, workclass,
   occupation, native_country, age, education_num, capital_gain,
   capital_loss, hours_per_week) = INPUT_COLUMNS

  # Continuous age turned categorical via bucketization; reused below.
  age_buckets = layers.bucketized_column(
      age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])

  # Linear side: interactions between categorical features come first,
  # followed by the categorical features themselves.
  linear_side = [
      cross([education, occupation], int(1e4)),
      cross([age_buckets, race, occupation], int(1e6)),
      cross([native_country, occupation], int(1e4)),
      gender, native_country, education, occupation,
      workclass, marital_status, relationship, age_buckets,
  ]

  # DNN side: one embedding per categorical feature, then the continuous
  # columns unchanged.
  embedded = [
      layers.embedding_column(column, dimension=embedding_size)
      for column in (workclass, education, marital_status, gender,
                     relationship, race, native_country, occupation)
  ]
  dnn_side = embedded + [
      age, education_num, capital_gain, capital_loss, hours_per_week
  ]

  layer_sizes = hidden_units if hidden_units else [100, 70, 50, 25]
  return tf.contrib.learn.DNNLinearCombinedClassifier(
      model_dir=model_dir,
      linear_feature_columns=linear_side,
      dnn_feature_columns=dnn_side,
      dnn_hidden_units=layer_sizes)
Example #12
0
def _real_valued_column_from_conf(name, feature_conf, **extra):
    """Build the real-valued column used by the dense and bucketized branches.

    ``dimension`` defaults to 1 and the default value is a list of that many
    zeros; ``extra`` keyword arguments are forwarded unchanged.
    """
    dimension = feature_conf.get('dimension', 1)
    return layers.real_valued_column(
        column_name=name,
        dimension=dimension,
        default_value=[0.0] * int(dimension),
        **extra)


def _boundaries_from_conf(feature_conf):
    """Parse the comma-separated ``boundaries`` string into a list of floats."""
    return [float(b) for b in feature_conf['boundaries'].split(',')]


def gen_feature(feature_conf):
    """Build one feature column from a feature configuration mapping.

    The column kind is chosen from the keys present in ``feature_conf``,
    checked in this order:

    1. ``vocab_size``: embedding over a keyed sparse column.
    2. ``hash_bucket_size`` without ``embedding_dimension``: plain sparse
       column (integerized when the value type is ``"Int"``, hashed
       otherwise).
    3. ``embedding_dimension`` + ``hash_bucket_size`` (no ``boundaries``, no
       ``vocabulary_file``): embedding over such a sparse column.
    4. ``embedding_dimension`` + ``vocabulary_file`` (no ``boundaries``):
       embedding over a vocabulary-file sparse column.
    5. ``embedding_dimension`` + ``boundaries``: embedded bucketized column.
    6. ``boundaries`` only: bucketized real-valued column.
    7. otherwise: real-valued column, L2-normalised when ``l2_norm`` is set.

    Args:
        feature_conf: mapping that holds the feature name under
            ``feature_name_key``, the value type under ``value_type_key``,
            and the optional keys listed above.

    Returns:
        The feature column selected by the rules above.

    Raises:
        Exception: for an ``"Int"`` feature in the vocabulary-file branch.
        KeyError: when a key required by the selected branch is missing.
    """
    name = feature_conf[feature_name_key]
    value_type = feature_conf[value_type_key]
    is_int = value_type == "Int"

    has_embedding = "embedding_dimension" in feature_conf
    has_hash_buckets = "hash_bucket_size" in feature_conf
    has_boundaries = "boundaries" in feature_conf
    has_vocab_file = "vocabulary_file" in feature_conf

    if "vocab_size" in feature_conf:
        id_feature = fc.sparse_column_with_keys(
            column_name=name,
            keys=range(feature_conf['vocab_size']),
            dtype=tf.string)

        # This branch shares the embedding under the feature's own name.
        return fc._EmbeddingColumn(
            id_feature,
            dimension=feature_conf['embedding_dimension'],
            shared_embedding_name=feature_conf.get(feature_name_key),
        )

    if has_hash_buckets and not has_embedding:
        if is_int:
            return layers.sparse_column_with_integerized_feature(
                column_name=name,
                bucket_size=feature_conf['hash_bucket_size'],
                combiner=_get_combiner(feature_conf),
            )
        return layers.sparse_column_with_hash_bucket(
            column_name=name,
            hash_bucket_size=feature_conf['hash_bucket_size'],
            combiner=_get_combiner(feature_conf),
        )

    if (has_embedding and has_hash_buckets
            and not has_boundaries and not has_vocab_file):
        if is_int:
            return _EmbeddingColumn(
                sparse_id_column=layers.sparse_column_with_integerized_feature(
                    column_name=name,
                    bucket_size=feature_conf['hash_bucket_size'],
                    combiner=_get_combiner(feature_conf),
                ),
                dimension=feature_conf['embedding_dimension'],
                combiner=_get_combiner(feature_conf),
                shared_embedding_name=feature_conf.get('shared_name', None))
        # The hashed id column is built without an explicit combiner here;
        # only the embedding receives one.
        id_feature = layers.sparse_column_with_hash_bucket(
            column_name=name,
            hash_bucket_size=feature_conf['hash_bucket_size'],
        )
        return _EmbeddingColumn(
            id_feature,
            dimension=feature_conf['embedding_dimension'],
            combiner=_get_combiner(feature_conf),
            shared_embedding_name=feature_conf.get('shared_name', None),
            max_norm=None)

    if has_embedding and not has_boundaries and has_vocab_file:
        # NOTE(review): this branch reads feature_conf["vocab_size"], but any
        # config that contains "vocab_size" is already consumed by the first
        # branch. As ordered it can therefore only raise (Exception for Int,
        # KeyError otherwise) -- confirm the intended precedence.
        if is_int:
            raise Exception(
                "embedding with vocabulary_file does not support Int type")
        id_feature = fc.sparse_column_with_vocabulary_file(
            column_name=name,
            vocabulary_file=feature_conf["vocabulary_file"],
            num_oov_buckets=feature_conf["num_oov_buckets"],
            vocab_size=feature_conf["vocab_size"],
        )
        return _EmbeddingColumn(
            id_feature,
            dimension=feature_conf['embedding_dimension'],
            combiner=_get_combiner(feature_conf),
            shared_embedding_name=feature_conf.get('shared_name', None),
            max_norm=None)

    if has_embedding and has_boundaries:
        return embedding_bucketized_column(
            _real_valued_column_from_conf(name, feature_conf),
            boundaries=_boundaries_from_conf(feature_conf),
            embedding_dimension=feature_conf["embedding_dimension"],
            max_norm=None,
            shared_name=feature_conf.get('shared_name', None),
            add_random=feature_conf.get('add_random', False))

    if has_boundaries:
        # No embedding requested (the previous branch handles that case).
        return layers.bucketized_column(
            _real_valued_column_from_conf(name, feature_conf),
            boundaries=_boundaries_from_conf(feature_conf))

    # Plain dense feature; normalise along the last axis when asked to.
    if 'l2_norm' in feature_conf:
        normalizer = lambda x: tf.nn.l2_normalize(x, dim=-1)
    else:
        normalizer = None
    return _real_valued_column_from_conf(
        name, feature_conf, normalizer=normalizer)
Example #13
0
def build_feature_cols():
    """Define the feature columns for the wide and the deep model parts.

    Returns:
        A ``(wide_columns, deep_columns)`` tuple.  The wide list holds the
        categorical columns, bucketized age, and three feature crosses; the
        deep list holds 8-dimensional embeddings of the categorical columns
        and crosses, followed by the continuous columns.
    """

    def hashed(name, buckets):
        # Sparse column whose string values are hashed into `buckets` buckets.
        return layers.sparse_column_with_hash_bucket(
            name, hash_bucket_size=buckets)

    def cross(columns, buckets):
        # Crossed feature over `columns`, hashed into `buckets` buckets.
        return layers.crossed_column(columns, hash_bucket_size=buckets)

    # Sparse base columns with a fixed vocabulary.
    gender = layers.sparse_column_with_keys(
        column_name="gender", keys=["female", "male"])
    race = layers.sparse_column_with_keys(
        column_name="race",
        keys=["Amer-Indian-Eskimo", "Asian-Pac-Islander", "Black", "Other",
              "White"])

    # Sparse base columns with hashed values.
    education = hashed("education", 1000)
    marital_status = hashed("marital_status", 100)
    relationship = hashed("relationship", 100)
    workclass = hashed("workclass", 100)
    occupation = hashed("occupation", 1000)
    native_country = hashed("native_country", 1000)

    # Continuous base columns.
    age, education_num, capital_gain, capital_loss, hours_per_week = [
        layers.real_valued_column(name)
        for name in ("age", "education_num", "capital_gain", "capital_loss",
                     "hours_per_week")
    ]

    # Transformations.
    age_buckets = layers.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    education_occupation = cross([education, occupation], int(1e4))
    age_race_occupation = cross([age_buckets, race, occupation], int(1e6))
    country_occupation = cross([native_country, occupation], int(1e4))

    categorical = [
        gender, native_country, education, occupation, workclass, race,
        marital_status, relationship
    ]
    crosses = [education_occupation, age_race_occupation, country_occupation]

    # Wide part: categorical columns, bucketized age, then the crosses.
    wide_columns = categorical + [age_buckets] + crosses

    # Deep part: every categorical column and cross is embedded (bucketized
    # age is deliberately not), then the continuous columns follow.
    deep_columns = [
        layers.embedding_column(column, dimension=8)
        for column in categorical + crosses
    ]
    deep_columns += [
        age, education_num, capital_gain, capital_loss, hours_per_week
    ]

    return wide_columns, deep_columns
Example #14
0
def build_estimator(model_dir, model_type):
    """Build a tf.contrib.learn classifier from the feature columns below.

    Args:
        model_dir: Forwarded unchanged as the estimator's ``model_dir``.
        model_type: ``"wide"`` returns a ``LinearClassifier`` over the wide
            columns, ``"deep"`` returns a ``DNNClassifier`` over the deep
            columns; any other value returns a
            ``DNNLinearCombinedClassifier`` that uses both column sets.

    Returns:
        The constructed estimator.
    """
    # (feature name, bucket size) for each integerized sparse feature.  The
    # same ordering is used for the wide list and for the deep embeddings.
    sparse_specs = [
        ('userID', 2805118),
        ('creativeID', 6582),
        ('positionID', 7645),
        ('adID', 3616),
        ('camgaignID', 720),
        ('advertiserID', 91),
        ('appID', 50),
        ('sitesetID', 3),
        ('appCategory', 14),
        ('appPlatform', 2),
        ('education', 8),
        ('gender', 3),
        ('haveBaby', 7),
        ('marriageStatus', 4),
        ('positionType', 6),
        ('hometown_c', 22),
        ('hometown_p', 35),
        ('residence_c', 22),
        ('residence_p', 35),
        ('telecomsOperator', 4),
        ('connectionType', 5),
        ('clickTime_week', 7),
    ]
    sparse_columns = [
        layers.sparse_column_with_integerized_feature(feature, size)
        for feature, size in sparse_specs
    ]

    # Continuous features, listed in the order they are fed to the deep part.
    real_names = [
        'age',
        'action_cate',
        'action_cate_recent',
        'action_installed',
        'inst_app_installed',
        'inst_cate_percent',
        'inst_cnt_appcate',
        'inst_cnt_installed',
        'inst_is_installed',
        'tt_cnt_appcate',
        'tt_is_installed',
        'clickTime_day',
        'clickTime_hour',
        'clickTime_minute',
    ]
    real_columns = [layers.real_valued_column(name) for name in real_names]
    real_by_name = dict(zip(real_names, real_columns))

    # Bucketized views of three continuous features (wide part only).
    age_buckets = layers.bucketized_column(
        real_by_name['age'],
        boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    inst_app_installed_buckets = layers.bucketized_column(
        real_by_name['inst_app_installed'],
        boundaries=[1000, 5000, 10000, 50000, 100000, 500000])
    clickTime_hour_buckets = layers.bucketized_column(
        real_by_name['clickTime_hour'],
        boundaries=[8, 11, 14, 17, 19, 22])

    # Wide part: raw sparse columns followed by the bucketized columns.
    wide_columns = sparse_columns + [
        age_buckets,
        clickTime_hour_buckets,
        inst_app_installed_buckets,
    ]

    # Deep part: an 8-dimensional embedding per sparse column, then the
    # continuous columns as-is.
    deep_columns = [
        layers.embedding_column(column, dimension=8)
        for column in sparse_columns
    ] + real_columns

    if model_type == "wide":
        return tf.contrib.learn.LinearClassifier(
            model_dir=model_dir,
            feature_columns=wide_columns)

    if model_type == "deep":
        return tf.contrib.learn.DNNClassifier(
            model_dir=model_dir,
            feature_columns=deep_columns,
            hidden_units=[100, 50])

    # Every other model_type value gets the combined wide-and-deep model.
    return tf.contrib.learn.DNNLinearCombinedClassifier(
        model_dir=model_dir,
        linear_feature_columns=wide_columns,
        dnn_feature_columns=deep_columns,
        dnn_hidden_units=[100, 50, 1],
        fix_global_step_increment_bug=True)