Example #1
def get_features_raw():
    real = {
      colname : tflayers.real_valued_column(colname) \
          for colname in \
            ('dep_delay,taxiout,distance,avg_dep_delay,avg_arr_delay' + 
             ',dep_lat,dep_lon,arr_lat,arr_lon').split(',')
    }
    sparse = {
      'carrier': tflayers.sparse_column_with_keys('carrier',
                  keys='AS,VX,F9,UA,US,WN,HA,EV,MQ,DL,OO,B6,NK,AA'.split(',')),
      'origin' : tflayers.sparse_column_with_hash_bucket('origin', hash_bucket_size=1000), # FIXME
      'dest'   : tflayers.sparse_column_with_hash_bucket('dest', hash_bucket_size=1000) #FIXME
    }
    return real, sparse
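A minimal consumption sketch (assuming tflayers is tf.contrib.layers; the LinearRegressor wiring and the model directory below are illustrative assumptions, not part of the original example):

import tensorflow.contrib.learn as tflearn

def make_linear_model(model_dir='/tmp/flights_model'):  # hypothetical path
    # Flatten both column dictionaries into a single feature-column list
    # and hand it to a plain linear model.
    real, sparse = get_features_raw()
    feature_columns = list(real.values()) + list(sparse.values())
    return tflearn.LinearRegressor(feature_columns=feature_columns,
                                   model_dir=model_dir)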
def build_estimator(model_dir, model_type):
    """build an estimator"""

    # base sparse feature process
    gender = layers.sparse_column_with_keys(column_name='gender', keys=['female', 'male'])
    education = layers.sparse_column_with_hash_bucket(column_name='education', hash_bucket_size=1000)
    relationship = layers.sparse_column_with_hash_bucket(column_name='relationship', hash_bucket_size=100)
    workclass = layers.sparse_column_with_hash_bucket(column_name='workclass', hash_bucket_size=100)
    occupation = layers.sparse_column_with_hash_bucket(column_name='occupation', hash_bucket_size=1000)
    native_country = layers.sparse_column_with_hash_bucket(column_name='native_country', hash_bucket_size=1000)

    # base continuous feature
    age = layers.real_valued_column(column_name='age')
    education_num = layers.real_valued_column(column_name='education_num')
    capital_gain = layers.real_valued_column(column_name='capital_gain')
    capital_loss = layers.real_valued_column(column_name='capital_loss')
    hours_per_week = layers.real_valued_column(column_name='hours_per_week')

    # Transformation: bucketization converts the continuous variable into categorical labels, which can improve accuracy
    age_bucket = layers.bucketized_column(source_column=age,
                                          boundaries=[18, 25, 30, 35, 40, 45,50, 55, 60, 65])

    # wide columns and deep columns
    # Features used by the deep model and features used by the wide model
    # The wide model only uses the categorical features
    wide_columns = [gender, native_country, education, relationship, workclass, occupation, age_bucket,
                    layers.crossed_column(columns=[education, occupation], hash_bucket_size=int(1e4)),
                    layers.crossed_column(columns=[age_bucket, education, occupation], hash_bucket_size=int(1e6)),
                    layers.crossed_column(columns=[native_country, occupation], hash_bucket_size=int(1e4))]

    deep_columns = [layers.embedding_column(workclass, dimension=8),
                    layers.embedding_column(education, dimension=8),
                    layers.embedding_column(gender, dimension=8),
                    layers.embedding_column(relationship, dimension=8),
                    layers.embedding_column(native_country, dimension=8),
                    layers.embedding_column(occupation, dimension=8),
                    age, education_num, capital_gain, capital_loss, hours_per_week]

    if model_type == "wide":
        m=learn.LinearClassifier(feature_columns=wide_columns, model_dir=model_dir)
    elif model_type == "deep":
        m=learn.DNNClassifier(feature_columns=deep_columns, model_dir=model_dir, hidden_units=[100, 50])
    else:
        m=learn.DNNLinearCombinedClassifier(model_dir=model_dir,
                                            linear_feature_columns=wide_columns,
                                            dnn_feature_columns=deep_columns,
                                            dnn_hidden_units=[256, 128, 64],
                                            dnn_activation_fn=tf.nn.relu)
    return m
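A hedged usage sketch for the function above; train_input_fn is a hypothetical input function returning ({feature_name: Tensor/SparseTensor}, labels) for the census features referenced by the columns:

m = build_estimator(model_dir='/tmp/census_model', model_type='wide_n_deep')  # assumed path and type
m.fit(input_fn=train_input_fn, steps=200)
print(m.evaluate(input_fn=train_input_fn, steps=1))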
Example #3
def get_wide_deep():
    # define column types
    
    StyleName = tflayers.sparse_column_with_hash_bucket('Style_Name', hash_bucket_size=1000)
    quantity = tflayers.real_valued_column('Quantity')
    demand = tflayers.real_valued_column('Demand')
    org_ret_price = tflayers.real_valued_column('Original_Retail_Price')
    sell_price = tflayers.real_valued_column('Selling_Price')
    margin = tflayers.real_valued_column('Margin')
    off_orig_retail = tflayers.real_valued_column('off_Orig_Retail')
    total_ots = tflayers.real_valued_column('Total_OTS')

    # which columns are wide (sparse, linear relationship to output)
    # and which are deep (complex relationship to output)?
    wide = [StyleName, quantity, demand]
    deep = [
        org_ret_price,
        sell_price,
        margin,
        off_orig_retail,
        total_ots,
        tflayers.embedding_column(StyleName, 3),
    ]
    return wide, deep
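The wide/deep split is typically handed to a combined estimator; a minimal sketch, assuming tflearn is tf.contrib.learn and that the model directory and hidden-unit sizes are free choices:

import tensorflow.contrib.learn as tflearn

wide, deep = get_wide_deep()
model = tflearn.DNNLinearCombinedRegressor(
    model_dir='/tmp/ots_model',      # hypothetical path
    linear_feature_columns=wide,     # sparse / linear-relationship features
    dnn_feature_columns=deep,        # continuous and embedded features
    dnn_hidden_units=[64, 32])       # assumed layer sizes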
Example #4
def generate_tf_columns():
    columns = OrderedDict()
    for fc in args['fc']:
        id_feature = layers.sparse_column_with_hash_bucket(
                            column_name=fc['feature_name'],
                            hash_bucket_size=fc['hash_bucket_size'])

                    
        embedding = layers.embedding_column(
                        id_feature,
                        dimension=fc["embedding_dimension"])
        columns[fc['feature_name']] = embedding
    return columns
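generate_tf_columns() reads a module-level args dict; each entry of args['fc'] must carry at least feature_name, hash_bucket_size and embedding_dimension. A hypothetical configuration illustrating the expected shape:

args = {
    'fc': [
        {'feature_name': 'user_id', 'hash_bucket_size': 100000, 'embedding_dimension': 16},
        {'feature_name': 'item_id', 'hash_bucket_size': 50000, 'embedding_dimension': 16},
    ]
}
columns = generate_tf_columns()  # OrderedDict of embedding columns keyed by feature name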
Example #5
def contrib_learn_classifier_test():
    """Test tf.contrib.learn.DNN_classifier."""
    language_column = layers.sparse_column_with_hash_bucket(
        "language", hash_bucket_size=20)

    feature_columns = [
        layers.embedding_column(language_column, dimension=3),
        layers.real_valued_column("age", dtype=tf.int64)
    ]

    classifier = learn.DNNClassifier(
        n_classes=3,
        feature_columns=feature_columns,
        hidden_units=[100, 100],
        config=learn.RunConfig(tf_random_seed=1,
                               model_dir="../model_saver/estimators/"
                               "DNN_classifier_01"),
        # optimizer=optimizer_exp_decay
    )
    classifier.fit(input_fn=_input_fn, steps=10000)
    print("variables_names:\n", str(classifier.get_variable_names()))
    # scores = classifier.evaluate(input_fn=_input_fn,
    #                              steps=100)
    # print("scores:\n", str(scores))

    scores = classifier.evaluate(
        input_fn=_input_fn,
        steps=100,
        metrics={
            'my_accuracy':
            MetricSpec(metric_fn=metrics.streaming_accuracy,
                       prediction_key="classes"),
            'my_precision':
            MetricSpec(metric_fn=metrics.streaming_precision,
                       prediction_key="classes"),
            'my_recall':
            MetricSpec(metric_fn=metrics.streaming_recall,
                       prediction_key="classes"),
            'my_metric':
            MetricSpec(metric_fn=my_metric_op, prediction_key="classes")
        })
    print("scores:\n", str(scores))

    predictions = classifier.predict(input_fn=_input_fn,
                                     outputs=["classes", "probabilities"])
    print("predictions")
    for prediction in predictions:
        print(prediction)
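_input_fn is not shown in this snippet; a minimal stand-in consistent with the language/age columns and n_classes=3 could look like this (purely an assumption):

def _input_fn():
    # Hypothetical input function: three examples with a sparse string
    # 'language' feature, an int64 'age' feature, and labels in {0, 1, 2}.
    features = {
        'language': tf.SparseTensor(values=['en', 'fr', 'zh'],
                                    indices=[[0, 0], [1, 0], [2, 0]],
                                    dense_shape=[3, 1]),
        'age': tf.constant([[18], [20], [25]], dtype=tf.int64),
    }
    labels = tf.constant([[1], [0], [2]])
    return features, labels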
Example #7
    def testClassifierWithAndWithoutKernelsNoRealValuedColumns(self):
        """Tests kernels have no effect for non-real valued columns ."""
        def input_fn():
            return {
                'price':
                constant_op.constant([[0.4], [0.6], [0.3]]),
                'country':
                sparse_tensor.SparseTensor(values=['IT', 'US', 'GB'],
                                           indices=[[0, 0], [1, 3], [2, 1]],
                                           dense_shape=[3, 5]),
            }, constant_op.constant([[1], [0], [1]])

        price = layers.real_valued_column('price')
        country = layers.sparse_column_with_hash_bucket('country',
                                                        hash_bucket_size=5)

        linear_classifier = kernel_estimators.KernelLinearClassifier(
            feature_columns=[price, country])
        linear_classifier.fit(input_fn=input_fn, steps=100)
        linear_metrics = linear_classifier.evaluate(input_fn=input_fn, steps=1)
        linear_loss = linear_metrics['loss']
        linear_accuracy = linear_metrics['accuracy']

        kernel_mappers = {
            country: [RandomFourierFeatureMapper(2, 30, 0.6, 1, 'rffm')]
        }

        kernel_linear_classifier = kernel_estimators.KernelLinearClassifier(
            feature_columns=[price, country], kernel_mappers=kernel_mappers)
        kernel_linear_classifier.fit(input_fn=input_fn, steps=100)
        kernel_linear_metrics = kernel_linear_classifier.evaluate(
            input_fn=input_fn, steps=1)
        kernel_linear_loss = kernel_linear_metrics['loss']
        kernel_linear_accuracy = kernel_linear_metrics['accuracy']

        # The kernel mapping is applied to a non-real-valued feature column and so
        # it should have no effect on the model. The loss and accuracy of the
        # "kernelized" model should match the loss and accuracy of the initial model
        # (without kernels).
        self.assertAlmostEqual(linear_loss, kernel_linear_loss, delta=0.01)
        self.assertAlmostEqual(linear_accuracy,
                               kernel_linear_accuracy,
                               delta=0.01)
def build_estimator(model_dir=MODEL_DIR):
    """
    Build an estimator using
    CONTINUOUS_COLUMNS, BINARY_COLUMNS and MULTI_CATEGORY_COLUMNS.
    """
    bucketized_columns = \
        [sparse_column_with_hash_bucket(col, 1000)
         for col in MULTI_CATEGORY_COLUMNS] + \
        [sparse_column_with_integerized_feature(col, bucket_size=2)
         for col in BINARY_COLUMNS]

    real_valued_columns = \
        [real_valued_column(col) for col in CONTINUOUS_COLUMNS]

    crossed_columns = \
        []

    # Wide columns and deep columns.
    wide_columns = \
        bucketized_columns + \
        real_valued_columns + \
        crossed_columns

    # embedding columns for hash_bucket columns
    deep_columns = \
        [embedding_column(col, dimension=EMBEDDING_DIMENSION)
         for col in bucketized_columns] + \
        real_valued_columns + \
        crossed_columns

    if MODEL_TYPE == "wide":
        print('Creating wide LinearClassifier model...\n')
        model = tf.contrib.learn.LinearClassifier(
            model_dir=model_dir,
            n_classes=2,
            feature_columns=wide_columns,
            # optimizer=tf.train.GradientDescentOptimizer(
            #     learning_rate=FLAGS.learn_rate)
            # optimizer=tf.train.FtrlOptimizer(
            #     learning_rate=LEARN_RATE,
            #     l1_regularization_strength=0.0,
            #     l2_regularization_strength=0.0),
        )

    elif MODEL_TYPE == "deep":
        print('Creating deep DNNClassifier model...\n')
        model = tf.contrib.learn.DNNClassifier(
            model_dir=model_dir,
            n_classes=2,
            feature_columns=deep_columns,
            hidden_units=HIDDEN_UNITS,
            # optimizer=tf.train.FtrlOptimizer(
            #     learning_rate=LEARN_RATE,
            #     l1_regularization_strength=0.0,
            #     l2_regularization_strength=0.0),
        )
    else:
        print('Creating deepNwide DNNLinearCombinedClassifier model...\n')
        model = tf.contrib.learn.DNNLinearCombinedClassifier(
            model_dir=model_dir,
            n_classes=2,
            linear_feature_columns=wide_columns,
            dnn_feature_columns=deep_columns,
            dnn_hidden_units=HIDDEN_UNITS,
            # optimizer=tf.train.FtrlOptimizer(
            #     learning_rate=LEARN_RATE,
            #     l1_regularization_strength=0.0,
            #     l2_regularization_strength=0.0),
        )

    return model
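build_estimator() relies on several module-level constants that are not part of this snippet; one hypothetical set of definitions that would satisfy it:

MODEL_DIR = '/tmp/wide_deep_model'   # assumed
MODEL_TYPE = 'wide_n_deep'           # 'wide', 'deep', or anything else for the combined model
CONTINUOUS_COLUMNS = ['age', 'hours_per_week']
BINARY_COLUMNS = ['is_member']
MULTI_CATEGORY_COLUMNS = ['occupation', 'native_country']
EMBEDDING_DIMENSION = 8
HIDDEN_UNITS = [100, 50]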
def get_feature_column():
    feature_name = 'gender'
    sparse_id_column = layers.sparse_column_with_hash_bucket(
        column_name=feature_name, hash_bucket_size=100)
    feature_column = layers.embedding_column(sparse_id_column, dimension=10)
    return feature_column
Example #10
PATH_TO_DIRECTORY_OF_THIS_FILE = dirname(realpath(__file__))
PATH_TO_DIRECTORY_OF_INPUT_DATA = PATH_TO_DIRECTORY_OF_THIS_FILE + "/data/input"
MODEL_DIR = PATH_TO_DIRECTORY_OF_THIS_FILE + "/classifier"

CATEGORICAL_COLUMNS = ["admin_level", "country_code", "edit_distance", "has_mpoly", "has_pcode", "is_country", "is_highest_population", "is_lowest_admin_level", "matches_topic"]
CONTINUOUS_COLUMNS = ["cluster_frequency", "country_rank", "median_distance", "population", "popularity"]
LABEL_COLUMN = "correct"
COLUMNS = sorted(CATEGORICAL_COLUMNS + CONTINUOUS_COLUMNS) + [LABEL_COLUMN]
print "COLUMNS:", COLUMNS


admin_level = sparse_column_with_keys(column_name="admin_level", keys=["None","0","1","2","3","4","5","6"]) # I've never seen admin 6, but you never know!
cluster_frequency = real_valued_column("cluster_frequency")
cluster_frequency_buckets = bucketized_column(cluster_frequency, boundaries=[0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1])
country_code = sparse_column_with_hash_bucket("country_code", hash_bucket_size=500)
country_rank = real_valued_column("country_rank")
edit_distance = sparse_column_with_keys(column_name="edit_distance", keys=["0", "1", "2"])
has_pcode = sparse_column_with_keys(column_name="has_pcode", keys=["True", "False"])
has_mpoly = sparse_column_with_keys(column_name="has_mpoly", keys=["True", "False"])
is_country = sparse_column_with_keys(column_name="is_country", keys=["True", "False"])
is_lowest_admin_level = sparse_column_with_keys(column_name="is_lowest_admin_level", keys=["True", "False"])
is_highest_population = sparse_column_with_keys(column_name="is_highest_population", keys=["True", "False"])
matches_topic = sparse_column_with_keys(column_name="matches_topic", keys=["True", "False"])
median_distance = real_valued_column("median_distance")
median_distance_buckets = bucketized_column(median_distance, boundaries=[10,50,100,200,300])
population = real_valued_column("population")
population_buckets = bucketized_column(population, boundaries=[0, 1, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000])
popularity = real_valued_column("popularity")
admin_level_x_median_distance = crossed_column([admin_level, median_distance_buckets], hash_bucket_size=int(1e4))
admin_level_x_cluster_frequency = crossed_column([admin_level, cluster_frequency_buckets], hash_bucket_size=int(1e4))
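The columns above could then be assembled into a classifier; a minimal sketch, assuming the same from-tensorflow.contrib.layers / tensorflow.contrib.learn import style as the rest of this snippet:

from tensorflow.contrib.learn import LinearClassifier

wide_columns = [
    admin_level, country_code, edit_distance, has_pcode, has_mpoly,
    is_country, is_lowest_admin_level, is_highest_population, matches_topic,
    cluster_frequency_buckets, median_distance_buckets, population_buckets,
    country_rank, popularity,
    admin_level_x_median_distance, admin_level_x_cluster_frequency,
]
classifier = LinearClassifier(feature_columns=wide_columns, model_dir=MODEL_DIR)  # sketch only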
Example #11
CSV_COLUMNS = [
    'actividad', 'anio', 'bueno', 'dia', 'lugar', 'viaje', 'mes', 'pais'
]
CSV_COLUMN_DEFAULTS = [[''], [0], [0], [''], [''], [''], [0], ['']]
LABEL_COLUMN = 'viaje'
LABELS = ['viaja', 'no viaja']
# Define the initial ingestion of each feature used by your model.
# Additionally, provide metadata about the feature.
INPUT_COLUMNS = [
    # Categorical base columns

    # For categorical columns with known values we can provide lists
    # of values ahead of time.

    # Otherwise we can use a hashing function to bucket the categories
    layers.sparse_column_with_hash_bucket('actividad', hash_bucket_size=1000),
    layers.real_valued_column('anio'),
    layers.real_valued_column('bueno'),
    layers.sparse_column_with_hash_bucket('dia', hash_bucket_size=1000),
    layers.sparse_column_with_hash_bucket('lugar', hash_bucket_size=1000),
    layers.real_valued_column('mes'),
    layers.sparse_column_with_hash_bucket('pais', hash_bucket_size=1000),

    # Continuous base columns.
]

UNUSED_COLUMNS = set(CSV_COLUMNS) - {col.name
                                     for col in INPUT_COLUMNS
                                     } - {LABEL_COLUMN}
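With CSV_COLUMNS and CSV_COLUMN_DEFAULTS defined, rows are typically decoded into that feature dictionary inside the input function; a minimal parsing sketch (the file-reading/dataset plumbing around it is assumed):

def parse_csv(rows_string_tensor):
    # Decode a batch of CSV lines into a feature dict plus the label.
    columns = tf.decode_csv(rows_string_tensor, record_defaults=CSV_COLUMN_DEFAULTS)
    features = dict(zip(CSV_COLUMNS, columns))
    for col in UNUSED_COLUMNS:       # drop columns the model does not consume
        features.pop(col)
    label = features.pop(LABEL_COLUMN)
    return features, label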

Example #12
    def build_network_my(self,
                         num_factor=10,
                         num_factor_mlp=64,
                         hidden_dimension=10,
                         num_neg_sample=30):
        print("my network")
        self.num_neg_sample = num_neg_sample
        self.user_id = tf.placeholder(dtype=tf.string,
                                      shape=[None],
                                      name='user_id')
        self.item_id = tf.placeholder(dtype=tf.string,
                                      shape=[None],
                                      name='item_id')
        ##########################################################################
        self.target_item_id = tf.placeholder(dtype=tf.string,
                                             shape=[None],
                                             name='target_item_id')
        self.hot_item_id = tf.placeholder(dtype=tf.string,
                                          shape=[None],
                                          name='hot_item_id')
        self.long_item_id = tf.placeholder(dtype=tf.string,
                                           shape=[None],
                                           name='long_item_id')
        ###########################################################################
        self.y = tf.placeholder(dtype=tf.float32, shape=[None], name='y')
        self.par = tf.placeholder(dtype=tf.float32)
        ###################################################################################

        ##################################################################################
        a = {'user': self.user_id}
        b = {'item': self.item_id}
        c = {'item': self.target_item_id}
        d = {'user_low': self.user_id}
        e = {'item_low': self.item_id}
        f = {'item_low': self.target_item_id}
        h = {'item': self.hot_item_id}
        l = {'item': self.long_item_id}
        with tf.variable_scope(name_or_scope='embedding',
                               reuse=tf.AUTO_REUSE) as scope:
            id_feature1 = layers.sparse_column_with_hash_bucket(
                column_name='user',
                hash_bucket_size=190000
                # use_hashmap=use_hashmap
            )

            id_feature2 = layers.sparse_column_with_hash_bucket(
                column_name='item',
                hash_bucket_size=120000
                # use_hashmap=use_hashmap
            )

            shared_embedding_columns1 = layers.embedding_column(
                id_feature1, dimension=64, combiner="mean")
            #
            #
            shared_embedding_columns2 = layers.embedding_column(
                id_feature2, dimension=64, combiner="mean")
            a1 = []
            a1.append(shared_embedding_columns1)
            b1 = []
            b1.append(shared_embedding_columns2)
            #
            mlp_user_latent_factor = layers.input_from_feature_columns(
                a, a1, scope='user')
            mlp_item_latent_factor = layers.input_from_feature_columns(
                b, b1, scope='item')
            mlp_target_item_latent_factor = layers.input_from_feature_columns(
                c, b1, scope='item')
            #########################################################################################
            mlp_hot_item_latent_factor = layers.input_from_feature_columns(
                h, b1, scope='item')
            mlp_long_item_latent_factor = layers.input_from_feature_columns(
                l, b1, scope='item')
            #########################################################################################

            id_feature3 = layers.sparse_column_with_hash_bucket(
                column_name='user_low',
                hash_bucket_size=190000
                # use_hashmap=use_hashmap
            )

            id_feature4 = layers.sparse_column_with_hash_bucket(
                column_name='item_low',
                hash_bucket_size=120000
                # use_hashmap=use_hashmap
            )

            shared_embedding_columns3 = layers.embedding_column(
                id_feature3, dimension=10, combiner="mean")
            #
            #
            shared_embedding_columns4 = layers.embedding_column(
                id_feature4, dimension=10, combiner="mean")
            d1 = []
            d1.append(shared_embedding_columns3)
            e1 = []
            e1.append(shared_embedding_columns4)
            #
            user_latent_factor = layers.input_from_feature_columns(
                d, d1, scope='user_low')
            item_latent_factor = layers.input_from_feature_columns(
                e, e1, scope='item_low')
            target_item_latent_factor = layers.input_from_feature_columns(
                f, e1, scope='item_low')
        ###################################################################################

###################################################################################################

###################################################################################################
        GMF = tf.multiply(user_latent_factor, item_latent_factor)
        #####################################################################
        GMF_target = tf.multiply(user_latent_factor, target_item_latent_factor)
        #####################################################################
        user_feature = self.user_side(mlp_user_latent_factor)
        item_feature = self.item_side(mlp_item_latent_factor)
        #########################################################
        target_item_feature = self.item_side(mlp_target_item_latent_factor,
                                             reuse=True)

        hot_item_feature = self.item_side(mlp_hot_item_latent_factor,
                                          reuse=True)
        long_item_feature = self.item_side(mlp_long_item_latent_factor,
                                           reuse=True)
        #########################################################
        self.pair_loss = 0
        self.resort_item = []
        self.resort_label = []
        for i in range(0, self.batch_size):
            temp1 = []
            temp2 = []

            temp1.append(item_feature[i * self.batch_size:(i + 1) *
                                      self.batch_size, :])
            temp2.append(self.y[i * self.batch_size:(i + 1) * self.batch_size])
            self.resort_item.append(temp1)
            self.resort_label.append(temp2)
        discriminative_loss = []

        for i in range(0, self.batch_size):
            discriminative_loss.append(
                get_center_loss(tf.reshape(self.resort_item[i], (-1, 128)),
                                tf.reshape(self.resort_label[i], (-1, 1)), 2))

        for i in range(0, self.batch_size):
            self.pair_loss = self.pair_loss + discriminative_loss[
                i] / self.batch_size
        # #########################################################
        # self.userF=user_feature
        # self.itemF=item_feature
        # #########################################
        # # self.pred_y = tf.nn.sigmoid(
        # #     tf.reduce_sum( 5 * tf.multiply(user_feature, item_feature),1))
        # ########################################
        self.pred_y = tf.nn.sigmoid(
            tf.reduce_sum(
                tf.concat([GMF, 5 * tf.multiply(user_feature, item_feature)],
                          axis=1), 1))
        # self.pred_long=tf.nn.sigmoid(tf.reduce_sum(tf.concat([GMF,5*tf.multiply(user_feature, target_item_feature)], axis=1), 1))
        avg_GMF = tf.reduce_mean(GMF)
        # avg_GMF=tf.stop_gradient(tf.identity(tf.reduce_mean(GMF)))
        self.pred_long = tf.nn.sigmoid(avg_GMF + tf.reduce_sum(
            5 * tf.multiply(user_feature, target_item_feature), 1))
        # self.pred_y = tf.layers.dense(inputs=tf.concat([GMF, MLP], axis=1), units=1, activation=tf.sigmoid, kernel_initializer=tf.random_normal_initializer, kernel_regularizer= tf.contrib.layers.l2_regularizer(scale=self.reg_rate))

        #Pseudo label
        self.p1 = tf.reshape(
            tf.gather(
                self.pred_long,
                tf.reshape(tf.where(tf.less(self.pred_long, 0.2)), [
                    -1,
                ])), [-1, 1])
        self.p2 = tf.reshape(
            tf.gather(
                self.pred_long,
                tf.reshape(tf.where(tf.greater(self.pred_long, 0.8)), [
                    -1,
                ])), [-1, 1])
        self.tar1 = tf.maximum(
            0.0,
            tf.reduce_mean(-self.p1 * tf.log(
                tf.clip_by_value(self.p1, 0.005, 1))))  #/ self.batch_size
        self.tar2 = tf.maximum(
            0.0,
            tf.reduce_mean(-self.p2 * tf.log(
                tf.clip_by_value(self.p2, 0.005, 1))))  #/ self.batch_size
        self.pseudo_loss = self.tar1 + self.tar2
        # self.loss = - tf.reduce_sum(
        #     self.y * tf.log(self.pred_y + 1e-10) + (1 - self.y) * tf.log(1 - self.pred_y + 1e-10))

        self.loss = tf.reduce_mean(
            tf.nn.sigmoid_cross_entropy_with_logits(logits=self.pred_y,
                                                    labels=self.y))
        self.weight_loss = 0.01 * tf.losses.get_regularization_loss(
        )  #+ self.reg_rate * (
        # tf.nn.l2_loss(self.P) + tf.nn.l2_loss(self.Q) + tf.nn.l2_loss(self.mlp_P) + tf.nn.l2_loss(self.mlp_Q))
        # self.DAloss=tf.maximum(0.0001,KMMD(hot_item_feature,long_item_feature))
        self.DAloss = self.coral_loss(hot_item_feature, long_item_feature)
        # self.optimizer = tf.train.AdagradOptimizer(self.learning_rate).minimize(self.loss)
        # self.total_loss=self.loss+self.weight_loss+100*self.DAloss
        # self.total_loss=self.loss+self.weight_loss+100*self.DAloss
        # self.total_loss = self.loss + self.weight_loss
        # self.total_loss=self.loss+self.weight_loss+100*self.DAloss+0.001*self.par*self.pseudo_loss+0.001*self.par*self.pair_loss
        self.total_loss = (self.loss + self.weight_loss
                           + self.A2C_weight * self.DAloss
                           + self.pseudo_weight * self.par * self.pseudo_loss
                           + self.center_weight * self.par * self.pair_loss)
        self.optimizer = tf.train.AdamOptimizer(0.0001).minimize(
            self.total_loss)

        return self
Example #13
def parse_feature_columns_from_examples_test():
    """Construct examples by tf.train.Example.
     Then, parse feature columns from examples.
     Finally, get input from feature columns.

    Returns:
        The input tensor transformed from examples in defined feature columns
         format.
    """
    language_column = layers.sparse_column_with_hash_bucket(
        "language", hash_bucket_size=20)

    feature_columns = [
        layers.embedding_column(language_column, dimension=3),
        layers.real_valued_column("age", dtype=tf.int64)
    ]
    example1 = tf.train.Example(features=tf.train.Features(
        feature={
            "age":
            tf.train.Feature(int64_list=tf.train.Int64List(value=[18])),
            "language":
            tf.train.Feature(bytes_list=tf.train.BytesList(value=[b"en"]))
        }))
    example2 = tf.train.Example(features=tf.train.Features(
        feature={
            "age":
            tf.train.Feature(int64_list=tf.train.Int64List(value=[20])),
            "language":
            tf.train.Feature(bytes_list=tf.train.BytesList(value=[b"fr"]))
        }))
    example3 = tf.train.Example(features=tf.train.Features(
        feature={
            "age":
            tf.train.Feature(int64_list=tf.train.Int64List(value=[25])),
            "language":
            tf.train.Feature(bytes_list=tf.train.BytesList(value=[b"en"]))
        }))
    examples = [
        example1.SerializeToString(),
        example2.SerializeToString(),
        example3.SerializeToString()
    ]
    print(examples)
    # feature_lists = tf.train.FeatureLists(
    #     feature_list={
    #         "age": tf.train.FeatureList(
    #             feature=[
    #                 tf.train.Feature(int64_list=tf.train.Int64List(value=[18])),
    #                 tf.train.Feature(int64_list=tf.train.Int64List(value=[20])),
    #                 tf.train.Feature(int64_list=tf.train.Int64List(value=[25])),
    #             ]
    #         ),
    #         "language": tf.train.FeatureList(
    #             feature=[
    #                 tf.train.Feature(bytes_list=tf.train.BytesList(value=[
    #                     b"en"])),
    #                 tf.train.Feature(bytes_list=tf.train.BytesList(value=[
    #                     b"fr"])),
    #                 tf.train.Feature(bytes_list=tf.train.BytesList(value=[
    #                     b"zh"]))
    #             ]
    #         )
    #     }
    # )
    # print(feature_lists)
    # serialized = feature_lists.SerializeToString()

    columns_to_tensor = layers.parse_feature_columns_from_examples(
        serialized=examples, feature_columns=feature_columns)
    input_layer = layers.input_from_feature_columns(
        columns_to_tensors=columns_to_tensor, feature_columns=feature_columns)
    print("input_layer:\n", str(input_layer))
    sess = tf.InteractiveSession()
    tf.initialize_all_variables().run(session=sess)
    print(input_layer.eval(session=sess))
'''
    Build a linear or logistic regression model with tf.learn
'''


def input_fn():
    '''
    Construct the input function.
    :return: dict
    '''
    return {
        'age':
        tf.constant([1]),
        'language':
        tf.SparseTensor(values=['english'],
                        indices=[[0, 0]],
                        dense_shape=[1, 1])
    }, tf.constant([[1]])


language = sparse_column_with_hash_bucket('language', 100)
age = real_valued_column('age')

# classifier = LinearClassifier(feature_columns=[age,language])
classifier = LinearClassifier(n_classes=3,
                              optimizer=FtrlOptimizer(learning_rate=0.1),
                              feature_columns=[age, language])
classifier.fit(input_fn=input_fn, steps=100)
print('%s' % classifier.evaluate(input_fn=input_fn, steps=1)['loss'])
print('%s' % classifier.get_variable_names())
Example #15
def build_feature_cols():
    # Sparse base columns.
    gender = layers.sparse_column_with_keys(column_name="gender",
                                            keys=["female", "male"])
    race = layers.sparse_column_with_keys(column_name="race",
                                          keys=[
                                              "Amer-Indian-Eskimo",
                                              "Asian-Pac-Islander", "Black",
                                              "Other", "White"
                                          ])

    education = layers.sparse_column_with_hash_bucket("education",
                                                      hash_bucket_size=1000)
    marital_status = layers.sparse_column_with_hash_bucket(
        "marital_status", hash_bucket_size=100)
    relationship = layers.sparse_column_with_hash_bucket("relationship",
                                                         hash_bucket_size=100)
    workclass = layers.sparse_column_with_hash_bucket("workclass",
                                                      hash_bucket_size=100)
    occupation = layers.sparse_column_with_hash_bucket("occupation",
                                                       hash_bucket_size=1000)
    native_country = layers.sparse_column_with_hash_bucket(
        "native_country", hash_bucket_size=1000)

    # Continuous base columns.
    age = layers.real_valued_column("age")
    education_num = layers.real_valued_column("education_num")
    capital_gain = layers.real_valued_column("capital_gain")
    capital_loss = layers.real_valued_column("capital_loss")
    hours_per_week = layers.real_valued_column("hours_per_week")

    # Transformations.
    age_buckets = layers.bucketized_column(
        age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
    education_occupation = layers.crossed_column([education, occupation],
                                                 hash_bucket_size=int(1e4))
    age_race_occupation = layers.crossed_column(
        [age_buckets, race, occupation], hash_bucket_size=int(1e6))
    country_occupation = layers.crossed_column([native_country, occupation],
                                               hash_bucket_size=int(1e4))

    # Wide columns and deep columns.
    wide_columns = [
        gender, native_country, education, occupation, workclass, race,
        marital_status, relationship, age_buckets, education_occupation,
        age_race_occupation, country_occupation
    ]

    deep_columns = [
        layers.embedding_column(gender, dimension=8),
        layers.embedding_column(native_country, dimension=8),
        layers.embedding_column(education, dimension=8),
        layers.embedding_column(occupation, dimension=8),
        layers.embedding_column(workclass, dimension=8),
        layers.embedding_column(race, dimension=8),
        layers.embedding_column(marital_status, dimension=8),
        layers.embedding_column(relationship, dimension=8),
        # layers.embedding_column(age_buckets, dimension=8),
        layers.embedding_column(education_occupation, dimension=8),
        layers.embedding_column(age_race_occupation, dimension=8),
        layers.embedding_column(country_occupation, dimension=8),
        age,
        education_num,
        capital_gain,
        capital_loss,
        hours_per_week,
    ]

    return wide_columns, deep_columns
INPUT_COLUMNS = [
    # Categorical base columns

    # For categorical columns with known values we can provide lists
    # of values ahead of time.
    layers.sparse_column_with_keys(column_name='gender', keys=['female', 'male']),

    layers.sparse_column_with_keys(
        column_name='race',
        keys=[
            'Amer-Indian-Eskimo',
            'Asian-Pac-Islander',
            'Black',
            'Other',
            'White'
        ]
    ),

    # Otherwise we can use a hashing function to bucket the categories
    layers.sparse_column_with_hash_bucket('education', hash_bucket_size=1000),
    layers.sparse_column_with_hash_bucket('marital_status', hash_bucket_size=100),
    layers.sparse_column_with_hash_bucket('relationship', hash_bucket_size=100),
    layers.sparse_column_with_hash_bucket('workclass', hash_bucket_size=100),
    layers.sparse_column_with_hash_bucket('occupation', hash_bucket_size=1000),
    layers.sparse_column_with_hash_bucket('native_country', hash_bucket_size=1000),

    # Continuous base columns.
    layers.real_valued_column('age'),
    layers.real_valued_column('education_num'),
    layers.real_valued_column('capital_gain'),
    layers.real_valued_column('capital_loss'),
    layers.real_valued_column('hours_per_week'),
]

UNUSED_COLUMNS = set(CSV_COLUMNS) - {col.name for col in INPUT_COLUMNS} - {LABEL_COLUMN}
Example #17
def gen_feature(feature_conf):
    name = feature_conf[feature_name_key]
    value_type = feature_conf[value_type_key]

    if "vocab_size" in feature_conf:
        id_feature = fc.sparse_column_with_keys(
            column_name=name,
            # keys are converted to strings to stay consistent with dtype=tf.string
            keys=[str(i) for i in range(feature_conf['vocab_size'])],
            dtype=tf.string)

        return fc._EmbeddingColumn(
            id_feature,
            dimension=feature_conf['embedding_dimension'],
            shared_embedding_name=feature_conf.get(feature_name_key),
        )
    elif "hash_bucket_size" in feature_conf \
            and "embedding_dimension" not in feature_conf:
        if value_type == "Int":
            id_feature = layers.sparse_column_with_integerized_feature(
                column_name=name,
                bucket_size=feature_conf['hash_bucket_size'],
                combiner=_get_combiner(feature_conf),
                # use_hashmap=use_hashmap
            )
        else:
            id_feature = layers.sparse_column_with_hash_bucket(
                column_name=name,
                hash_bucket_size=feature_conf['hash_bucket_size'],
                combiner=_get_combiner(feature_conf),
                # use_hashmap=use_hashmap
            )
        return id_feature
    elif "embedding_dimension" in feature_conf \
            and "hash_bucket_size" in feature_conf \
            and "boundaries" not in feature_conf \
            and "vocabulary_file" not in feature_conf:
        if value_type == "Int":
            return _EmbeddingColumn(
                sparse_id_column=layers.sparse_column_with_integerized_feature(
                    column_name=name,
                    bucket_size=feature_conf['hash_bucket_size'],
                    combiner=_get_combiner(feature_conf),
                    # use_hashmap=use_hashmap
                ),
                dimension=feature_conf['embedding_dimension'],
                combiner=_get_combiner(feature_conf),
                shared_embedding_name=feature_conf.get('shared_name', None))
        else:
            id_feature = layers.sparse_column_with_hash_bucket(
                column_name=name,
                hash_bucket_size=feature_conf['hash_bucket_size'],
                # use_hashmap=use_hashmap
            )
            return _EmbeddingColumn(
                id_feature,
                dimension=feature_conf['embedding_dimension'],
                combiner=_get_combiner(feature_conf),
                shared_embedding_name=feature_conf.get('shared_name', None),
                max_norm=None)
    elif "embedding_dimension" in feature_conf \
            and "boundaries" not in feature_conf and "vocabulary_file" in feature_conf:
        use_hashmap = feature_conf.get("use_hashmap", False)
        if value_type == "Int":
            raise Exception(
                "embedding with vocabulary_file does not support Int type")
        else:
            id_feature = fc.sparse_column_with_vocabulary_file(
                column_name=name,
                vocabulary_file=feature_conf["vocabulary_file"],
                num_oov_buckets=feature_conf["num_oov_buckets"],
                vocab_size=feature_conf["vocab_size"],
            )
            return _EmbeddingColumn(
                id_feature,
                dimension=feature_conf['embedding_dimension'],
                combiner=_get_combiner(feature_conf),
                shared_embedding_name=feature_conf.get('shared_name', None),
                max_norm=None)
    elif "embedding_dimension" in feature_conf \
            and "boundaries" in feature_conf:
        return embedding_bucketized_column(
            layers.real_valued_column(
                column_name=name,
                dimension=feature_conf.get('dimension', 1),
                default_value=[
                    0.0 for _ in range(int(feature_conf.get('dimension', 1)))
                ]),
            boundaries=[
                float(b) for b in feature_conf['boundaries'].split(',')
            ],
            embedding_dimension=feature_conf["embedding_dimension"],
            max_norm=None,
            shared_name=feature_conf.get('shared_name', None),
            add_random=feature_conf.get('add_random', False))
    elif "embedding_dimension" not in feature_conf \
            and "boundaries" in feature_conf:
        return layers.bucketized_column(
            layers.real_valued_column(
                column_name=name,
                dimension=feature_conf.get('dimension', 1),
                default_value=[
                    0.0 for _ in range(int(feature_conf.get('dimension', 1)))
                ]),
            boundaries=[
                float(b) for b in feature_conf['boundaries'].split(',')
            ])
    else:
        return layers.real_valued_column(
            column_name=name,
            dimension=feature_conf.get('dimension', 1),
            default_value=[
                0.0 for _ in range(int(feature_conf.get('dimension', 1)))
            ],
            normalizer=None if 'l2_norm' not in feature_conf else
            lambda x: tf.nn.l2_normalize(x, dim=-1))
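The branching in gen_feature() keys off which fields are present in feature_conf (feature_name_key and value_type_key are module-level constants not shown here; 'feature_name' and 'value_type' are assumed below). A few hypothetical configurations and the branch each would take:

hashed_only = {
    'feature_name': 'city', 'value_type': 'String',
    'hash_bucket_size': 1000,
}  # -> plain sparse_column_with_hash_bucket

hashed_embedding = {
    'feature_name': 'item_id', 'value_type': 'String',
    'hash_bucket_size': 100000, 'embedding_dimension': 16,
}  # -> _EmbeddingColumn over a hash-bucket column

bucketized = {
    'feature_name': 'price', 'value_type': 'Double',
    'boundaries': '0,10,100,1000',
}  # -> bucketized_column over a real_valued_column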