Example #1
def get_xy_random2(X, y, cols_family={}):
    # X = np.random.rand(100,30)
    # y = np.random.binomial(n=1, p=0.5, size=[100])

    ## PREPROCESSING STEPS
    # Convert to a DataFrame; DeepCTR expects string column names.
    cols = [str(i) for i in range(X.shape[1])]
    data = pd.DataFrame(X, columns=cols)

    # Pick up the sparse / dense column split provided by the caller.
    cols_sparse_features = cols_family['colsparse']
    cols_dense_features  = cols_family['coldense']


    # Wrap each feature as a SparseFeat or DenseFeat, as the DeepCTR API expects.
    sparse_feat_l = [SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
                     for feat in cols_sparse_features]
    dense_feat_l = [DenseFeat(feat, dimension=1) for feat in cols_dense_features]
    feature_col = sparse_feat_l + dense_feat_l

    linear_feat_col = feature_col  # features used by the linear part of the model
    dnn_feat_col = feature_col  # features used by the deep part of the model
    feature_names = get_feature_names(linear_feat_col + dnn_feat_col)

    train_model_input = {name: data[name] for name in feature_names}
    X_train, y_train = train_model_input, np.asarray(y)  # accept a Series or an ndarray

    return X_train, y_train, linear_feat_col, dnn_feat_col
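
A minimal usage sketch (an assumption, not part of the original snippet): the returned input dict and feature columns feed directly into DeepCTR's DeepFM.

X = np.random.rand(100, 30)
y = pd.Series(np.random.binomial(n=1, p=0.5, size=100))
cols_family = {'colsparse': [], 'coldense': [str(i) for i in range(30)]}
X_train, y_train, linear_cols, dnn_cols = get_xy_random2(X, y, cols_family)

model = DeepFM(linear_cols, dnn_cols, task='binary')  # assumes: from deepctr.models import DeepFM
model.compile('adam', 'binary_crossentropy')
model.fit(X_train, y_train, batch_size=32, epochs=1)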
Example #2
def test_long_dense_vector():
    feature_columns = [
        SparseFeat(
            'user_id',
            4,
        ),
        SparseFeat(
            'item_id',
            5,
        ),
        DenseFeat("pic_vec", 5)
    ]
    fixlen_feature_names = get_feature_names(feature_columns)

    user_id = np.array([[1], [0], [1]])
    item_id = np.array([[3], [2], [1]])
    pic_vec = np.array([[0.1, 0.5, 0.4, 0.3, 0.2], [0.1, 0.5, 0.4, 0.3, 0.2],
                        [0.1, 0.5, 0.4, 0.3, 0.2]])
    label = np.array([1, 0, 1])

    input_dict = {'user_id': user_id, 'item_id': item_id, 'pic_vec': pic_vec}
    model_input = [input_dict[name] for name in fixlen_feature_names]

    model = DeepFM(feature_columns, feature_columns[:-1])
    model.compile('adagrad', 'binary_crossentropy')
    model.fit(model_input, label)
Example #3
def test_long_dense_vector():
    # Build the feature columns
    feature_columns = [
        SparseFeat(
            'user_id',
            4,
        ),
        SparseFeat(
            'item_id',
            5,
        ),
        DenseFeat("pic_vec", 5)
    ]
    fixlen_feature_names = get_feature_names(feature_columns)

    # Build the samples
    user_id = np.array([[1], [0], [1]])
    item_id = np.array([[3], [2], [1]])
    pic_vec = np.array([[0.1, 0.5, 0.4, 0.3, 0.2], [0.1, 0.5, 0.4, 0.3, 0.2],
                        [0.1, 0.5, 0.4, 0.3, 0.2]])
    label = np.array([1, 0, 1])
    input_dict = {'user_id': user_id, 'item_id': item_id, 'pic_vec': pic_vec}
    model_input = [input_dict[name] for name in fixlen_feature_names]

    # Create the model
    model = DeepFM(feature_columns, feature_columns[:-1])

    # model.summary()
    #tf.keras.utils.plot_model(model, "test_compu")

    # Train the model
    model.compile('adagrad', 'binary_crossentropy')
    model.fit(model_input, label)
Example #4
def get_xy_fd(hash_flag=False):
    feature_columns = [SparseFeat('user', 3, embedding_dim=10),
                       SparseFeat('gender', 2, embedding_dim=4),
                       SparseFeat('item_id', 3 + 1, embedding_dim=8),
                       SparseFeat('cate_id', 2 + 1, embedding_dim=4),
                       DenseFeat('pay_score', 1)]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id', vocabulary_size=3 + 1, embedding_dim=8, embedding_name='item_id'),
                         maxlen=4, length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_cate_id', 2 + 1, embedding_dim=4, embedding_name='cate_id'), maxlen=4,
                         length_name="seq_length")]
    # Notice: History behavior sequence feature name must start with "hist_".
    behavior_feature_list = ["item_id", "cate_id"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    cate_id = np.array([1, 2, 2])  # 0 is mask value
    pay_score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [3, 2, 1, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 2, 2, 0], [2, 2, 1, 0], [1, 2, 0, 0]])
    seq_length = np.array([3, 3, 2])  # the actual length of the behavior sequence

    feature_dict = {'user': uid, 'gender': ugender, 'item_id': iid, 'cate_id': cate_id,
                    'hist_item_id': hist_iid, 'hist_cate_id': hist_cate_id,
                    'pay_score': pay_score, 'seq_length': seq_length}
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
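
A hedged usage sketch (an assumption, mirroring DeepCTR's DIN example): the returned values plug straight into the DIN model, which takes the feature columns plus the behavior feature list.

x, y, feature_columns, behavior_feature_list = get_xy_fd()
model = DIN(feature_columns, behavior_feature_list)  # assumes: from deepctr.models import DIN
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
model.fit(x, y, epochs=1, verbose=1)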
Example #5
def get_xy_random():
    X = np.random.rand(100, 30)
    y = np.random.binomial(n=1, p=0.5, size=[100])

    ## PREPROCESSING STEPS
    # Convert to a DataFrame; DeepCTR expects string column names.
    cols = [str(i) for i in range(X.shape[1])]
    data = pd.DataFrame(X, columns=cols)
    data['y'] = y

    # Decide which feature columns are sparse and which are dense.
    # All columns here are continuous, so the sparse list stays empty.
    cols_sparse_features = []
    cols_dense_features = [str(i) for i in range(X.shape[1])]

    # Wrap each feature as a SparseFeat or DenseFeat, as the DeepCTR API expects.
    sparse_feat_l = [
        SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
        for feat in cols_sparse_features
    ]

    dense_feat_l = [
        DenseFeat(feat, dimension=1) for feat in cols_dense_features
    ]
    feature_col = sparse_feat_l + dense_feat_l

    linear_feat_col = feature_col  # features used by the linear part of the model
    dnn_feat_col = feature_col  # features used by the deep part of the model
    feature_names = get_feature_names(linear_feat_col + dnn_feat_col)

    train_full, test = train_test_split(data,
                                        random_state=2021,
                                        stratify=data['y'])
    train, val = train_test_split(train_full,
                                  random_state=2021,
                                  stratify=train_full['y'])

    train_model_input = {name: train[name] for name in feature_names}
    val_model_input = {name: val[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
    target = 'y'
    ## END OF PREPROCESSING STEPS

    X_train, y_train = train_model_input, train[target].values
    X_val, y_val = val_model_input, val[target].values
    X_test, y_test = test_model_input, test[target].values
    return X_train, X_val, X_test, y_train, y_val, y_test, linear_feat_col, dnn_feat_col
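
A minimal training sketch (an assumption, not from the source), consuming the splits returned above:

X_train, X_val, X_test, y_train, y_val, y_test, linear_cols, dnn_cols = get_xy_random()
model = DeepFM(linear_cols, dnn_cols, task='binary')  # assumes: from deepctr.models import DeepFM
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=32, epochs=1)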
Example #6
    def fit(self, X, y):
        X_ = X.copy()
        self.dense_features = list(X_.columns.difference(self.cat_features))

        logger.debug("MinMaxScaler")
        self.min_max_scaler.fit(X_[self.dense_features])
        X_[self.dense_features] = self.min_max_scaler.transform(
            X_[self.dense_features])

        self._column_mapping(X_)
        X_.columns = [self.columns_mapping[col] for col in X_.columns]

        self.fixlen_feature_columns = [
            SparseFeat(
                self.columns_mapping[feat],
                vocabulary_size=X_[self.columns_mapping[feat]].max() + 1,
                embedding_dim=4,
            ) for feat in self.cat_features
        ] + [
            DenseFeat(
                self.columns_mapping[feat],
                1,
            ) for feat in self.dense_features
        ]
        self.feature_names = get_feature_names(self.fixlen_feature_columns)

        logger.debug("Compile DeepFM model")
        self.model = DeepFM(self.fixlen_feature_columns,
                            self.fixlen_feature_columns,
                            task="binary")
        self.model.compile(
            "adam",
            "binary_crossentropy",
            metrics=["binary_crossentropy"],
        )

        logger.debug("Fit DeepFM")
        train_model_input = {
            name: X_[name].values
            for name in self.feature_names
        }
        self.model.fit(
            train_model_input,
            y,
            batch_size=256,
            epochs=3,
            verbose=2,
            validation_split=0.2,
        )
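
    # The surrounding class is not shown; a plausible predict counterpart
    # (an assumption, mirroring the transforms applied in fit) might look like:
    def predict_proba_sketch(self, X):
        X_ = X.copy()
        X_[self.dense_features] = self.min_max_scaler.transform(X_[self.dense_features])
        X_.columns = [self.columns_mapping[col] for col in X_.columns]
        model_input = {name: X_[name].values for name in self.feature_names}
        return self.model.predict(model_input, batch_size=256)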
Example #7
def run_base_experiment(data_path, dataset_type, model_params, model_type,
                        opt):
    if dataset_type == 'critero':
        data_df, sparse_features, dense_features, target = load_citero_dataset(
            data_path)
    else:
        data_df, sparse_features, dense_features, target = load_taboola_dataset(
            data_path)
    data_df = prepare_data_for_train(data_df, sparse_features, dense_features)
    fixlen_feature_columns = [
        SparseFeat(feat, vocabulary_size=data_df[feat].nunique(), embedding_dim=10)
        for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)
    # 3. Generate input data for the model
    train, test = train_test_split(data_df, test_size=0.2)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
    batch_size = 1024
    # 4. Define the model, then train, predict, and evaluate
    model = model_type(linear_feature_columns,
                       dnn_feature_columns,
                       seed=1024,
                       **model_params)
    model.compile(
        optimizer=opt,
        loss="binary_crossentropy",
        metrics=['binary_crossentropy', 'accuracy'],
    )
    history = model.fit(
        train_model_input,
        train[target].values,
        batch_size=batch_size,
        epochs=10,
        verbose=1,
        validation_split=0.2,
    )
    pred_ans = model.predict(test_model_input, batch_size=batch_size)
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
Example #8
def get_xy_fd():

    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item_id', 3 + 1, embedding_dim=8),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4),
        DenseFeat('pay_score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=8,
                                    embedding_name='item_id'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_cate_id',
                                    2 + 1,
                                    embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4)
    ]

    behavior_feature_list = ["item_id", "cate_id"]  # 变长特征使用的base稀疏特征
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    cate_id = np.array([1, 2, 2])  # 0 is mask value
    pay_score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [3, 2, 1, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 2, 2, 0], [2, 2, 1, 0], [1, 2, 0, 0]])

    # feature name -> input data
    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item_id': iid,
        'cate_id': cate_id,
        'hist_item_id': hist_iid,
        'hist_cate_id': hist_cate_id,
        'pay_score': pay_score
    }
    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
Example #9
def read_data_as_model():
    data = pd.read_csv('GiveMeSomeCredit/cs-training.csv')
    sparse_features = [
        'NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfTimes90DaysLate',
        'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents'
    ]
    dense_features = [
        'RevolvingUtilizationOfUnsecuredLines', 'age', 'DebtRatio',
        'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans',
        'NumberRealEstateLoansOrLines'
    ]

    data[sparse_features] = data[sparse_features].fillna(-1, )
    data[dense_features] = data[dense_features].fillna(-1, )
    target = ['SeriousDlqin2yrs']

    # 1. Label-encode sparse features and apply a simple transformation to dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Count the unique values of each sparse field and record the dense field names

    fixlen_feature_columns = [
        SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
        for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    # 3. Generate input data for the model

    train, test = train_test_split(data, test_size=0.2, random_state=1234)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    return train, test, train_model_input, test_model_input, dnn_feature_columns, linear_feature_columns, feature_names, target
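
A hedged sketch (an assumption) of how these return values are typically consumed, following the standard DeepCTR binary-classification recipe:

train, test, train_input, test_input, dnn_cols, linear_cols, feature_names, target = read_data_as_model()
model = DeepFM(linear_cols, dnn_cols, task='binary')  # assumes: from deepctr.models import DeepFM
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
model.fit(train_input, train[target].values, batch_size=256, epochs=10, verbose=2, validation_split=0.2)
pred_ans = model.predict(test_input, batch_size=256)
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))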
Example #10
def get_xy_fd(hash_flag=False):
    feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        SparseFeat('item', 3 + 1),
        SparseFeat('item_gender', 2 + 1),
        DenseFeat('score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=8,
                                    embedding_name='item'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_item_gender',
                                    2 + 1,
                                    embedding_dim=4,
                                    embedding_name='item_gender'),
                         maxlen=4)
    ]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'hist_item': hist_iid,
        'hist_item_gender': hist_igender,
        'score': score
    }

    feature_names = get_feature_names(feature_columns)
    x = {name: feature_dict[name] for name in feature_names}
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
Example #11
def get_xy_fd():
    feature_columns = [SparseFeat('driver_age', 7, embedding_dim=32),
                       SparseFeat('pax_age', 7, embedding_dim=32),
                       SparseFeat('des_id', 10000, embedding_dim=32),
                       SparseFeat('price_id', 20, embedding_dim=32)]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_price_id', vocabulary_size=5, embedding_dim=32), maxlen=3),
        VarLenSparseFeat(SparseFeat('hist_des_id', vocabulary_size=5, embedding_dim=32), maxlen=3)]
    # Notice: History behavior sequence feature name must start with "hist_".
    behavior_feature_list = ["price_id", "des_id"]
    driver_age = np.array([0, 1, 2])
    pax_age = np.array([0, 1, 0])
    pax_des = np.array([1, 2, 3])  # 0 is mask value
    pax_price = np.array([1, 2, 2])  # 0 is mask value

    hist_price_seq = np.array([[1, 2, 3], [3, 2, 1], [1, 2, 0]])
    hist_des_seq = np.array([[1, 2, 2], [2, 2, 1], [1, 2, 0]])

    feature_dict = {'driver_age': driver_age, 'pax_age': pax_age, 'des_id': pax_des, 'price_id': pax_price,
                    'hist_price_id': hist_price_seq, 'hist_des_id': hist_des_seq}
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
Example #12
def get_xy_fd(use_neg=False, hash_flag=False):
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item_id', 3 + 1, embedding_dim=8, use_hash=hash_flag),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('pay_score', 1)
    ]

    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=8,
                                    embedding_name='item_id'),
                         maxlen=4,
                         length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_cate_id',
                                    2 + 1,
                                    embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4,
                         length_name="seq_length")
    ]

    behavior_feature_list = ["item_id", "cate_id"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    cate_id = np.array([1, 2, 2])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]])

    behavior_length = np.array([3, 3, 2])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item_id': iid,
        'cate_id': cate_id,
        'hist_item_id': hist_iid,
        'hist_cate_id': hist_cate_id,
        'pay_score': score,
        "seq_length": behavior_length
    }

    if use_neg:
        feature_dict['neg_hist_item_id'] = np.array(
            [[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_cate_id'] = np.array(
            [[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]])
        feature_columns += [
            VarLenSparseFeat(SparseFeat('neg_hist_item_id',
                                        vocabulary_size=3 + 1,
                                        embedding_dim=8,
                                        embedding_name='item_id'),
                             maxlen=4,
                             length_name="seq_length"),
            VarLenSparseFeat(SparseFeat('neg_hist_cate_id',
                                        2 + 1,
                                        embedding_dim=4,
                                        embedding_name='cate_id'),
                             maxlen=4,
                             length_name="seq_length")
        ]

    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
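
A hedged usage sketch (an assumption, mirroring DeepCTR's DIEN example): with use_neg=True the extra neg_hist_* sequences enable DIEN's negative-sampling auxiliary loss.

x, y, feature_columns, behavior_feature_list = get_xy_fd(use_neg=True)
model = DIEN(feature_columns, behavior_feature_list,  # assumes: from deepctr.models import DIEN
             dnn_hidden_units=[4, 4, 4], gru_type="AUGRU", use_negsampling=True)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
model.fit(x, y, epochs=1, verbose=1)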
Example #13
def get_xy_fd(hash_flag=False):
    feature_columns = [
        SparseFeat('user', 3, use_hash=hash_flag),
        SparseFeat('gender', 2, use_hash=hash_flag),
        SparseFeat('item', 3 + 1, use_hash=hash_flag),
        SparseFeat('item_gender', 2 + 1, use_hash=hash_flag),
        DenseFeat('score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('sess_0_item',
                                    3 + 1,
                                    embedding_dim=4,
                                    use_hash=hash_flag,
                                    embedding_name='item'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('sess_0_item_gender',
                                    2 + 1,
                                    embedding_dim=4,
                                    use_hash=hash_flag,
                                    embedding_name='item_gender'),
                         maxlen=4)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('sess_1_item',
                                    3 + 1,
                                    embedding_dim=4,
                                    use_hash=hash_flag,
                                    embedding_name='item'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('sess_1_item_gender',
                                    2 + 1,
                                    embedding_dim=4,
                                    use_hash=hash_flag,
                                    embedding_name='item_gender'),
                         maxlen=4)
    ]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    sess1_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [0, 0, 0, 0]])
    sess1_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [0, 0, 0, 0]])

    sess2_iid = np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
    sess2_igender = np.array([[1, 1, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0]])

    sess_number = np.array([2, 1, 0])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'sess_0_item': sess1_iid,
        'sess_0_item_gender': sess1_igender,
        'score': score,
        'sess_1_item': sess2_iid,
        'sess_1_item_gender': sess2_igender,
    }

    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    x["sess_length"] = sess_number

    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
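
A hedged usage sketch (an assumption): these session inputs match DeepCTR's DSIN model, with sess_max_count set to the two sessions built above.

x, y, feature_columns, behavior_feature_list = get_xy_fd()
model = DSIN(feature_columns, behavior_feature_list, sess_max_count=2,  # assumes: from deepctr.models import DSIN
             dnn_hidden_units=[4, 4, 4])
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
model.fit(x, y, epochs=1, verbose=1)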
Example #14
    VarLenSparseFeat(sparsefeat=SparseFeat('hist_merchant_id', vocabulary_size=1993, embedding_dim=8,
                                           embedding_name='merchant_id'), maxlen=M),
    VarLenSparseFeat(sparsefeat=SparseFeat('hist_action_type', vocabulary_size=4, embedding_dim=4,
                                           embedding_name='action_type'), maxlen=M)]
history_features = ['merchant_id', 'action_type']
print(len(feature_columns))

# Use the DIN model
model = DIN(feature_columns, history_features)
# Adam optimizer with binary cross-entropy loss
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
# model.compile(loss=[binary_focal_loss(alpha=.25, gamma=2)], metrics=["accuracy"])

# Assemble train_model_input: get the feature names and convert train_X into a dict
feature_names = list(train_X.columns)
train_model_input = {name: train_X[name].values for name in get_feature_names(feature_columns)}
print("########################################")

# History inputs must be 2-D arrays
from tqdm import tqdm

for fea in ['hist_merchant_id', 'hist_action_type']:
    rows = [i for i in tqdm(train_model_input[fea])]  # don't shadow the builtin `list`
    train_model_input[fea] = np.array(rows)

history = model.fit(train_model_input, train_y.values, verbose=True, epochs=10, validation_split=0.2, batch_size=512)

# Convert test_model_input
test_data['action_type'] = 3
Example #15
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Count the unique values of each sparse field and record the dense field names

    fixlen_feature_columns = [
        SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=4)
        for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns) # list of string

    # 3. Generate input data for the model
    train, test = train_test_split(data, test_size=0.2, random_state=2020)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4. Define the model, then train, predict, and evaluate
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
    model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'])

    history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=10, verbose=2, validation_split=0.2, )
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
Example #16
    print('data.columns', data.columns.tolist())
    print('unique date_: ', data['date_'].unique())

    train = data[data['date_'] < 14]
    val = data[data['date_'] == 14]  # day-14 samples serve as the validation set
    pretrained_feed_embedding_initializer = tf.initializers.identity(feed_embedding)

    # 2. Count the unique values of each sparse field and record the dense field names
    fixlen_feature_columns = (
        [SparseFeat('feedid', vocabulary_size=data['feedid'].max() + 1, embedding_dim=512,
                    embeddings_initializer=pretrained_feed_embedding_initializer)]
        + [SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=embedding_dim)
           for feat in sparse_features if feat != 'feedid']  # '!=', not 'is not', for string comparison
        + [DenseFeat(feat, 1) for feat in dense_features])

    dnn_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(dnn_feature_columns)

    # 3. Generate input data for the model
    train_model_input = {name: train[name] for name in feature_names}
    val_model_input = {name: val[name] for name in feature_names}
    userid_list = val['userid'].astype(str).tolist()
    test_model_input = {name: test[name] for name in feature_names}

    train_labels = [train[y].values for y in target]
    val_labels = [val[y].values for y in target]

    # 4. Define the model, then train, predict, and evaluate
    train_model = MMOE(dnn_feature_columns, num_tasks=4, expert_dim=8, dnn_hidden_units=(128, 128),
                       tasks=['binary', 'binary', 'binary', 'binary'])
    train_model.compile("adagrad", loss='binary_crossentropy')
    # print(train_model.summary())
Example #17
def get_xy_from_txt(file_path="data/movielens_sample_din.txt"):
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item_id', 3 + 1, embedding_dim=8),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4),
        DenseFeat('pay_score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=8,
                                    embedding_name='item_id'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_cate_id',
                                    2 + 1,
                                    embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4)
    ]

    behavior_feature_list = ["item_id", "cate_id"]
    # head = ['label', 'user', 'gender', 'item_id', 'cate_id', 'hist_item_id', 'hist_cate_id', 'pay_score']

    data = pd.read_csv(file_path, delimiter=',')

    def to_int_array(x):
        # parse strings like "1|2|3" into np.array([1, 2, 3])
        return np.array([int(s) for s in x.split('|')])

    data['hist_item_id'] = data['hist_item_id'].apply(to_int_array)
    data['hist_cate_id'] = data['hist_cate_id'].apply(to_int_array)

    uid = np.array(data['user'])

    ugender = np.array(data['gender'])
    iid = np.array(data['item_id'])  # 0 is mask value
    cate_id = np.array(data['cate_id'])  # 0 is mask value
    pay_score = np.array(data['pay_score'])
    print("hist_cate_id: ", type(data['hist_cate_id']),
          type(data['hist_cate_id'][0]), np.shape(data['hist_cate_id'][0]),
          data['hist_cate_id'])
    print("------------" * 10)
    hist_iid = np.array(data['hist_item_id'].tolist())
    hist_cate_id = np.array(data['hist_cate_id'].tolist())
    print("uid: ", type(uid), uid)
    print("hist_cate_id: ", type(hist_cate_id), type(hist_cate_id[0]),
          np.shape(hist_cate_id[0]), hist_cate_id)
    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item_id': iid,
        'cate_id': cate_id,
        'hist_item_id': hist_iid,
        'hist_cate_id': hist_cate_id,
        'pay_score': pay_score
    }
    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = np.array(data.pop('label'))

    return x, y, feature_columns, behavior_feature_list
Example #18
        untrainable_features_columns = []
    dense_features = []


else:
    print('plz input dataset name')
    sys.exit()

udg_features = 'userId'
target = ['rating']
behavior_feature_list = ['itemId', 'category']

fixlen_feature_columns = [SparseFeat(feat, train[feat].nunique(), embedding_dim=int(sys.argv[5]))
                          for feat in sparse_features] + [DenseFeat(feat, 1) for feat in dense_features]
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
fixlen_feature_names = get_feature_names(fixlen_feature_columns)
train_model_input = {name: train[name] for name in fixlen_feature_names}  
test_model_input = {name: test[name] for name in fixlen_feature_names}

if sys.argv[1] in ['DIEN', 'DIEN_UDG', 'DIN', 'DIN_UDG']:
    test_model_input, test_label, max_len = get_input(test, 0, 'test')
    train_model_input, train_label, _ = get_input(train, max_len, 'train')
    fixlen_feature_columns = [SparseFeat(feat, train[feat].nunique()+1, embedding_dim=int(sys.argv[5])) for feat in sparse_features]
    fixlen_feature_columns += [DenseFeat(feat, 1,) for feat in dense_features]
    fixlen_feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_itemId', train['itemId'].nunique() + 1,
                         embedding_dim = int(sys.argv[5]), embedding_name='itemId'), maxlen=max_len, 
                        length_name='seq_length'),
        VarLenSparseFeat(SparseFeat('hist_category', train['category'].nunique() + 1,
                         embedding_dim = int(sys.argv[5]), embedding_name='category'), maxlen=max_len, 
                        length_name='seq_length'),
Example #19
def get_xy_dataset(data_sample=None):
    if data_sample == "avazu":
        df = pd.read_csv(
            'https://raw.githubusercontent.com/shenweichen/DeepCTR/master/examples/avazu_sample.txt'
        )
        df['day'] = df['hour'].apply(lambda x: str(x)[4:6])
        df['hour'] = df['hour'].apply(lambda x: str(x)[6:])

        sparse_features = [
            'hour',
            'C1',
            'banner_pos',
            'site_id',
            'site_domain',
            'site_category',
            'app_id',
            'app_domain',
            'app_category',
            'device_id',
            'device_model',
            'device_type',
            'device_conn_type',  # 'device_ip',
            'C14',
            'C15',
            'C16',
            'C17',
            'C18',
            'C19',
            'C20',
            'C21',
        ]

        df[sparse_features] = df[sparse_features].fillna('-1', )
        target = ['click']

        # 1. Label-encode the sparse features
        for feat in sparse_features:
            lbe = LabelEncoder()
            df[feat] = lbe.fit_transform(df[feat])

        # 2. Record the field group of each sparse feature
        field_info = dict(C14='user',
                          C15='user',
                          C16='user',
                          C17='user',
                          C18='user',
                          C19='user',
                          C20='user',
                          C21='user',
                          C1='user',
                          banner_pos='context',
                          site_id='context',
                          site_domain='context',
                          site_category='context',
                          app_id='item',
                          app_domain='item',
                          app_category='item',
                          device_model='user',
                          device_type='user',
                          device_conn_type='context',
                          hour='context',
                          device_id='user')

        fixlen_feat_col = [
            SparseFeat(name,
                       vocabulary_size=df[name].nunique(),
                       embedding_dim=16,
                       use_hash=False,
                       dtype='int32',
                       group_name=field_info[name]) for name in sparse_features
        ]

        dnn_feat_col = fixlen_feat_col
        linear_feat_col = fixlen_feat_col
        feature_names = get_feature_names(linear_feat_col + dnn_feat_col)

    elif data_sample == "criteo":
        df = pd.read_csv(
            'https://raw.githubusercontent.com/shenweichen/DeepCTR/master/examples/criteo_sample.txt'
        )
        sparse_features = ['C' + str(i) for i in range(1, 27)]
        dense_features = ['I' + str(i) for i in range(1, 14)]

        df[sparse_features] = df[sparse_features].fillna('-1', )
        df[dense_features] = df[dense_features].fillna(0, )
        target = ['label']

        # 1. Label-encode sparse features and min-max scale dense features
        for feat in sparse_features:
            lbe = LabelEncoder()
            df[feat] = lbe.fit_transform(df[feat])
        mms = MinMaxScaler(feature_range=(0, 1))
        df[dense_features] = mms.fit_transform(df[dense_features])

        # 2. Count the unique values of each sparse field and record the dense field names
        fixlen_feat_col = [
            SparseFeat(feat, vocabulary_size=df[feat].nunique(), embedding_dim=4)
            for feat in sparse_features
        ] + [DenseFeat(feat, 1) for feat in dense_features]

        dnn_feat_col = fixlen_feat_col
        linear_feat_col = fixlen_feat_col
        feature_names = get_feature_names(linear_feat_col + dnn_feat_col)

    elif data_sample == "movielens":
        df = pd.read_csv(
            "https://raw.githubusercontent.com/shenweichen/DeepCTR/master/examples/movielens_sample.txt"
        )
        sparse_features = [
            "movie_id", "user_id", "gender", "age", "occupation", "zip"
        ]
        target = ['rating']

        # 1. Label-encode the sparse features
        for feat in sparse_features:
            lbe = LabelEncoder()
            df[feat] = lbe.fit_transform(df[feat])

        # 2. Count the unique values of each sparse field
        fixlen_feat_col = [
            SparseFeat(feat, df[feat].nunique(), embedding_dim=4)
            for feat in sparse_features
        ]
        linear_feat_col = fixlen_feat_col
        dnn_feat_col = fixlen_feat_col
        feature_names = get_feature_names(linear_feat_col + dnn_feat_col)

    else:
        raise ValueError("data_sample must be one of 'avazu', 'criteo' or 'movielens'")

    # 3. Generate input data for the model
    train_full, test = train_test_split(df,
                                        random_state=2021,
                                        stratify=df[target])
    train, val = train_test_split(train_full,
                                  random_state=2021,
                                  stratify=train_full[target])

    train_model_input = {name: train[name] for name in feature_names}
    val_model_input = {name: val[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    X_train, y_train = train_model_input, train[target].values
    X_val, y_val = val_model_input, val[target].values
    X_test, y_test = test_model_input, test[target].values
    return X_train, X_val, X_test, y_train, y_val, y_test, linear_feat_col, dnn_feat_col
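
A minimal usage sketch (an assumption, not from the source): any of the three sample datasets can then be trained the same way.

X_train, X_val, X_test, y_train, y_val, y_test, linear_cols, dnn_cols = get_xy_dataset("criteo")
model = DeepFM(linear_cols, dnn_cols, task='binary')  # assumes: from deepctr.models import DeepFM
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
model.fit(X_train, y_train, validation_data=(X_val, y_val), batch_size=256, epochs=1)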
Example #20
# Load the data
data = pd.read_csv("deepfm_movielens_full.csv")
sparse_features = ["MovieID", "UserID", "Genres", "Age", "OccupationID", "Zip-code"]
target = ['Rating']


# Label-encode the sparse features
for feature in sparse_features:
    lbe = LabelEncoder()
    data[feature] = lbe.fit_transform(data[feature])
# Count the number of distinct values in each feature
fixlen_feature_columns = [SparseFeat(feature, data[feature].nunique()) for feature in sparse_features]
print(fixlen_feature_columns)
linear_feature_columns = fixlen_feature_columns
dnn_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# Split the dataset into training and test sets
train, test = train_test_split(data, test_size=0.2)
train_model_input = {name: train[name].values for name in feature_names}
test_model_input = {name: test[name].values for name in feature_names}

# Train with DeepFM
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile("adam", "mse", metrics=['mse'])
history = model.fit(train_model_input, train[target].values, batch_size=256, epochs=1, verbose=True, validation_split=0.2)
# Predict with DeepFM
pred_ans = model.predict(test_model_input, batch_size=256)
# Report MSE and RMSE
mse = round(mean_squared_error(test[target].values, pred_ans), 4)
rmse = mse ** 0.5
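# The snippet stops before reporting the metric; a natural closing line (an assumption):
print("test MSE:", mse, "test RMSE:", round(rmse, 4))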
Example #21
def main(model_dir, data_dir, train_steps, model_name):
    data = pd.read_csv(os.path.join(data_dir, 'criteo_sample.txt'))

    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]

    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['label']

    # 1. Label-encode sparse features and apply a simple transformation to dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Count the unique values of each sparse field and record the dense field names

    fixlen_feature_columns = [
        SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
        for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    # 3. Generate input data for the model

    train, test = train_test_split(data, test_size=0.2, random_state=2020)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4. Define the model, then train, predict, and evaluate
    if model_name == 'DeepFM':
        model = DeepFM(linear_feature_columns,
                       dnn_feature_columns,
                       task='binary')
    elif model_name == 'FNN':
        model = FNN(linear_feature_columns, dnn_feature_columns, task='binary')
    elif model_name == 'WDL':
        model = WDL(linear_feature_columns, dnn_feature_columns, task='binary')
    elif model_name == 'MLR':
        model = MLR(linear_feature_columns, dnn_feature_columns, task='binary')
    elif model_name == 'NFM':
        model = NFM(linear_feature_columns, dnn_feature_columns, task='binary')
    elif model_name == 'DIN':
        model = DIN(linear_feature_columns, dnn_feature_columns, task='binary')
    elif model_name == 'CCPM':
        model = CCPM(linear_feature_columns,
                     dnn_feature_columns,
                     task='binary')
    elif model_name == 'PNN':
        model = PNN(linear_feature_columns, dnn_feature_columns, task='binary')
    elif model_name == 'AFM':
        model = AFM(linear_feature_columns, dnn_feature_columns, task='binary')
    elif model_name == 'DCN':
        model = DCN(linear_feature_columns, dnn_feature_columns, task='binary')
    elif model_name == 'DIEN':
        model = DIEN(linear_feature_columns,
                     dnn_feature_columns,
                     task='binary')
    elif model_name == 'DSIN':
        model = DSIN(linear_feature_columns,
                     dnn_feature_columns,
                     task='binary')
    elif model_name == 'xDeepFM':
        model = xDeepFM(linear_feature_columns,
                        dnn_feature_columns,
                        task='binary')
    elif model_name == 'AutoInt':
        model = AutoInt(linear_feature_columns,
                        dnn_feature_columns,
                        task='binary')
    elif model_name == 'ONN':
        model = ONN(linear_feature_columns, dnn_feature_columns, task='binary')
    elif model_name == 'FGCNN':
        model = FGCNN(linear_feature_columns,
                      dnn_feature_columns,
                      task='binary')
    elif model_name == 'FiBiNET':
        model = FiBiNET(linear_feature_columns,
                        dnn_feature_columns,
                        task='binary')
    elif model_name == 'FLEN':
        model = FLEN(linear_feature_columns,
                     dnn_feature_columns,
                     task='binary')
    else:
        print(model_name + ' is not supported yet.')
        return

    gpus = int(os.getenv('SM_NUM_GPUS', '0'))
    print('gpus:', gpus)
    if gpus > 1:
        from tensorflow.keras.utils import multi_gpu_model
        model = multi_gpu_model(model, gpus=gpus)

    model.compile(
        "adam",
        "binary_crossentropy",
        metrics=['binary_crossentropy'],
    )

    history = model.fit(
        train_model_input,
        train[target].values,
        batch_size=256,
        epochs=train_steps,
        verbose=2,
        validation_split=0.2,
    )
    pred_ans = model.predict(test_model_input, batch_size=256)
    try:
        print("test LogLoss", round(log_loss(test[target].values, pred_ans),
                                    4))
    except Exception as e:
        print(e)
    try:
        print("test AUC", round(roc_auc_score(test[target].values, pred_ans),
                                4))
    except Exception as e:
        print(e)

    model.save_weights(os.path.join(model_dir, model_name + '_w.h5'))
Example #22
    def get_feature_names(self):
        return get_feature_names(self.features_linear + self.features_dnn)