Example 1
def test_long_dense_vector():

    feature_columns = [
        SparseFeat(
            'user_id',
            4,
        ),
        SparseFeat(
            'item_id',
            5,
        ),
        DenseFeat("pic_vec", 5)
    ]
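    # model inputs must be ordered by the names returned by get_fixlen_feature_names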
    fixlen_feature_names = get_fixlen_feature_names(feature_columns)

    user_id = np.array([[1], [0], [1]])
    item_id = np.array([[3], [2], [1]])
    pic_vec = np.array([[0.1, 0.5, 0.4, 0.3, 0.2], [0.1, 0.5, 0.4, 0.3, 0.2],
                        [0.1, 0.5, 0.4, 0.3, 0.2]])
    label = np.array([1, 0, 1])

    input_dict = {'user_id': user_id, 'item_id': item_id, 'pic_vec': pic_vec}
    model_input = [input_dict[name] for name in fixlen_feature_names]

    model = DeepFM(feature_columns, feature_columns[:-1])
    model.compile('adagrad', 'binary_crossentropy')
    model.fit(model_input, label)
Example 2
    def load_stats(self):
        fixlen_feature_columns = [
            SparseFeat(feat, self.cat_meta[feat]) for feat in self.sparse_features
        ] + [DenseFeat(feat, 1) for feat in self.dense_features]

        self.dnn_feature_columns = fixlen_feature_columns
        self.linear_feature_columns = fixlen_feature_columns

        self.fixlen_feature_names = get_fixlen_feature_names(self.linear_feature_columns + self.dnn_feature_columns)
Example 3
def get_xy_fd(hash_flag=False):
    # feature_dim_dict = {"sparse": [SingleFeat('user', 3, hash_flag), SingleFeat(
    #     'gender', 2, hash_flag), SingleFeat('item', 3 + 1, hash_flag), SingleFeat('item_gender', 2 + 1, hash_flag)],
    #                     "dense": [SingleFeat('score', 0)]}

    feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        SparseFeat('item', 3 + 1),
        SparseFeat('item_gender', 2 + 1),
        DenseFeat('score', 1)  # score is a single dense value per sample
    ]
    feature_columns += [
        VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
        VarLenSparseFeat('hist_item_gender',
                         3 + 1,
                         maxlen=4,
                         embedding_name='item_gender')
    ]
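    # the hist_* sequence features reuse the item / item_gender embedding tables via embedding_name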

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'hist_item': hist_iid,
        'hist_item_gender': hist_igender,
        'score': score
    }

    feature_names = get_fixlen_feature_names(feature_columns)
    varlen_feature_names = get_varlen_feature_names(feature_columns)
    x = [feature_dict[name] for name in feature_names
         ] + [feature_dict[name] for name in varlen_feature_names]

    # x = [feature_dict[feat.name] for feat in feature_dim_dict["sparse"]] + [feature_dict[feat.name] for feat in
    #                                                                         feature_dim_dict["dense"]] + [
    #         feature_dict['hist_' + feat] for feat in behavior_feature_list]

    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
Example 4
def get_xy_fd(use_neg=False, hash_flag=False):

    feature_columns = [SparseFeat('user', 3, hash_flag),
                       SparseFeat('gender', 2, hash_flag),
                       SparseFeat('item', 3 + 1, hash_flag),
                       SparseFeat('item_gender', 2 + 1, hash_flag),
                       DenseFeat('score', 1)]

    feature_columns += [VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
                        VarLenSparseFeat('hist_item_gender', 3 + 1, maxlen=4, embedding_name='item_gender')]

    behavior_feature_list = ["item","item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    behavior_length = np.array([3,3,2])
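    # true (unpadded) history length per sample; appended to x as an extra model input below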

    feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'item_gender': igender,
                    'hist_item': hist_iid, 'hist_item_gender': hist_igender,
                    'score': score}

    #x = [feature_dict[feat.name] for feat in feature_dim_dict["sparse"]] + [feature_dict[feat.name] for feat in
    #                                                                        feature_dim_dict["dense"]] + [
    #        feature_dict['hist_' + feat] for feat in behavior_feature_list]


    if use_neg:
        feature_dict['neg_hist_item'] = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_item_gender'] = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
        feature_columns += [VarLenSparseFeat('neg_hist_item', 3 + 1, maxlen=4, embedding_name='item'),
                            VarLenSparseFeat('neg_hist_item_gender', 3 + 1, maxlen=4, embedding_name='item_gender')]
        #x += [feature_dict['neg_hist_'+feat] for feat in behavior_feature_list]


    feature_names = get_fixlen_feature_names(feature_columns)
    varlen_feature_names = get_varlen_feature_names(feature_columns)
    print(varlen_feature_names)
    x = [feature_dict[name] for name in feature_names] + [feature_dict[name] for name in varlen_feature_names]

    x += [behavior_length]
    y = [1, 0, 1]
    print(len(x))
    return x, y, feature_columns, behavior_feature_list
Example 5
def main():

    Use_SF = False
    if len(sys.argv) > 1 and sys.argv[1] == 'SF':  # sys.argv[0] is the script path, so check the first real argument
        Use_SF = True

    train, vali, test = GetFeatures(Use_SF)

    feature_count = []
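    # manual label encoding fit on train only: ids start at 1, and 0 is reserved for values unseen at training time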
    for feat in sparse_features:
        print("Fitting {}".format(feat))
        labels = {}
        for x in train[feat]:
            if x not in labels:
                labels[x] = len(labels) + 1
        print("Transforming {}".format(feat))
        for df in [train, vali, test]:
            df[feat] = df[feat].map(lambda x: labels.get(x, 0))
        feature_count.append(len(labels) + 1)

    sparse_feature_columns = [
        SparseFeat(f, f_c) for f, f_c in zip(sparse_features, feature_count)
    ]
    dense_feature_columns = [DenseFeat(f, 1) for f in dense_features]
    fixlen_feature_columns = sparse_feature_columns + dense_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns +
                                                    dnn_feature_columns)

    train_model_input = [train[name] for name in fixlen_feature_names]
    vali_model_input = [vali[name] for name in fixlen_feature_names]
    test_model_input = [test[name] for name in fixlen_feature_names]

    def eval(target):
        model, history = model_generate(train_model_input, train[[target]],
                                        vali_model_input, vali[[target]],
                                        linear_feature_columns,
                                        dnn_feature_columns)
        pred_ans = model.predict(test_model_input, batch_size=256)
        print(target + " test LogLoss",
              round(log_loss(test[target].values, pred_ans), 4))
        print(target + " test AUC",
              round(roc_auc_score(test[target].values, pred_ans), 4))

    for target in targets:
        eval(target)
Example 6
def read(data, lbe_store):
    # data['time'] = data['time'].apply(lambda x: timestamp(x), convert_dtype='int32')
    sparse_features = ["user_id", "item_id", "item_category", "time"]
    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_features:
        data[feat] = lbe_store[feat].transform(data[feat])
    # 2.count #unique features for each sparse field
    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ]
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns +
                                                    dnn_feature_columns)
    data_model_input = [data[name].values for name in fixlen_feature_names]
    return data, data_model_input
Example 7
def get_xy_fd():

    feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        SparseFeat('item', 3 + 1),
        SparseFeat('item_gender', 2 + 1),
        DenseFeat('score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
        VarLenSparseFeat('hist_item_gender',
                         3 + 1,
                         maxlen=4,
                         embedding_name='item_gender')
    ]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'hist_item': hist_iid,
        'hist_item_gender': hist_igender,
        'score': score
    }

    fixlen_feature_names = get_fixlen_feature_names(feature_columns)
    varlen_feature_names = get_varlen_feature_names(feature_columns)
    x = [feature_dict[name] for name in fixlen_feature_names
         ] + [feature_dict[name] for name in varlen_feature_names]

    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
Example 8
    def get_train_instances(self, train):
        users, checkins, cand_venues, labels = [], [], [], []

        for u in self.trainSeq:
            visited = self.trainSeq[u]
            checkin_ = []
            for v in visited[:-1]:
                checkin_.append(v)
                checkins.extend(sequence.pad_sequences([checkin_[:]], maxlen=self.maxVenue))

            # start from the second venue in user's checkin sequence.
            visited = visited[1:]
            for i in range(len(visited)):
                cand_venues.append(visited[i])
                users.append(u)
                labels.append(1)
                j = np.random.randint(self.uNum)
                # check if j is in training dataset or in user's sequence at state i or not
                while (u, j) in train or j in visited[:i]:
                    j = np.random.randint(self.uNum)

                cand_venues.append(j)
                users.append(u)
                labels.append(0)

        sess_number = np.ones(len(labels))
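        # every generated instance carries exactly one (padded) check-in session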

        users = np.array(users)
        items = np.array(cand_venues)
        sess_item = np.array(checkins)
        labels = np.array(labels)

        feature_dict = {'user': users, 'item': items, 'score': labels, 'sess_0_item': sess_item}

        fixlen_feature_names = get_fixlen_feature_names(self.feature_columns)
        varlen_feature_names = get_varlen_feature_names(self.feature_columns)
        x = [feature_dict[name] for name in fixlen_feature_names] + [feature_dict[name] for name in
                                                                     varlen_feature_names]
        x += [sess_number]

        return x, labels
Example 9
def get_xy_fd(hash_flag=False):

    feature_columns = [SparseFeat('user', 3, hash_flag),
                       SparseFeat('gender', 2, hash_flag),
                       SparseFeat('item', 3 + 1, hash_flag),
                       SparseFeat('item_gender', 2 + 1, hash_flag),
                       DenseFeat('score', 1)]
    feature_columns += [
        VarLenSparseFeat('sess_0_item', 3 + 1, 4, use_hash=hash_flag, embedding_name='item'),
        VarLenSparseFeat('sess_0_item_gender', 2 + 1, 4, use_hash=hash_flag, embedding_name='item_gender')
    ]
    feature_columns += [
        VarLenSparseFeat('sess_1_item', 3 + 1, 4, use_hash=hash_flag, embedding_name='item'),
        VarLenSparseFeat('sess_1_item_gender', 2 + 1, 4, use_hash=hash_flag, embedding_name='item_gender')
    ]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    sess1_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [0, 0, 0, 0]])
    sess1_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [0, 0, 0, 0]])

    sess2_iid = np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
    sess2_igender = np.array([[1, 1, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0]])

    sess_number = np.array([2, 1, 0])
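    # number of non-empty sessions per sample (the third sample has no session history)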

    feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'item_gender': igender,
                    'sess_0_item': sess1_iid, 'sess_0_item_gender': sess1_igender, 'score': score,
                    'sess_1_item': sess2_iid, 'sess_1_item_gender': sess2_igender, }

    fixlen_feature_names = get_fixlen_feature_names(feature_columns)
    varlen_feature_names = get_varlen_feature_names(feature_columns)
    x = [feature_dict[name] for name in fixlen_feature_names] + [feature_dict[name] for name in varlen_feature_names]


    x += [sess_number]

    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
Example 10
def main(args):

    if args.arch == 'xDeepFM':
        s = time.time()
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data',
                                'train_data')
        item = pd.read_csv(csv_file,
                           dtype={
                               'article_id': str,
                               'hh': int,
                               'gender': str,
                               'age_range': str,
                               'read_article_ids': str
                           },
                           sep='\t')
        label_data_path = os.path.join(
            DATASET_PATH, 'train',
            os.path.basename(os.path.normpath(csv_file)).split('_')[0] +
            '_label')
        label = pd.read_csv(label_data_path, dtype={'label': int}, sep='\t')
        item['label'] = label
        s = time.time()
        print(f'before test article preprocess : {len(item)}')

        sparse_features = [
            'article_id', 'hh', 'gender', 'age_range', 'len_bin'
        ]
        dense_features = ['image_feature', 'read_cnt_prob']
        target = ['label']

        ############################ make more feature !!!!!!! #################################
        ############## 1. read_article_ids len cnt -- user feature #################################################
        len_lis = []

        read_article_ids_all = item['read_article_ids'].tolist()
        for i in range(len(item)):
            li = read_article_ids_all[i]
            if type(li) == float:
                len_lis.append(0)
                continue
            len_li = len(li.split(','))
            len_lis.append(len_li)

        item['len'] = len_lis
        item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')
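        # len_bin: read-history length bucketed into at most 6 quantile bins (duplicate edges dropped)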

        id_to_artic = dict()
        artics = item['article_id'].tolist()

        ################ 2. read_cnt, total_cnt, prob_read_cnt --- article feature ####################################
        read_cnt = item[item['label'] == 1].groupby('article_id').agg(
            {'hh': 'count'})
        read_cnt = read_cnt.reset_index()
        read_cnt = read_cnt.rename(columns={'hh': 'read_cnt'})

        read_cnt_list = read_cnt['read_cnt'].tolist()
        read_cnt_artic_list = read_cnt['article_id'].tolist()
        print(f'len read_cnt : {len(read_cnt)}')
        print(read_cnt.head(3))

        total_cnt = item.groupby('article_id').agg({'hh': 'count'})
        total_cnt = total_cnt.reset_index()
        total_cnt = total_cnt.rename(columns={'hh': 'read_cnt'})
        total_cnt_list = total_cnt['read_cnt'].tolist()
        total_cnt_artic_list = total_cnt['article_id'].tolist()
        print(f'len read_cnt : {len(total_cnt)}')
        print(total_cnt.head(3))

        # lit # test_article_ids list
        lit_cnt = []
        lit_total_cnt = []
        lit_cnt_prob = []
        lit = list(set(artics))
        lit.sort()
        print(lit[:10])
        print(f'len(lit):{len(lit)}')
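        # per-article read count, total impression count, and read probability (read_cnt / total_cnt)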
        for i in range(len(lit)):
            # lit_cnt
            cur_artic = lit[i]
            if cur_artic not in read_cnt_artic_list:
                lit_cnt.append(0)
            else:
                for j in range(len(read_cnt_artic_list)):
                    if cur_artic == read_cnt_artic_list[j]:
                        lit_cnt.append(read_cnt_list[j])
                        break
            # lit_total_cnt
            if cur_artic not in total_cnt_artic_list:
                lit_total_cnt.append(0)
            else:
                for j in range(len(total_cnt_artic_list)):
                    if cur_artic == total_cnt_artic_list[j]:
                        lit_total_cnt.append(total_cnt_list[j])
                        break
            # lit_cnt_prob
            if lit_total_cnt[i] == 0:
                lit_cnt_prob.append(0)
            else:
                lit_cnt_prob.append(lit_cnt[i] / lit_total_cnt[i])
        print('--- read_cnt article feature completed ---')
        print(f'lit_cnt {len(lit_cnt)}')
        print(f'lit_total_cnt {len(lit_total_cnt)}')
        print(f'lit_cnt_prob {len(lit_cnt_prob)}')

        #### fea
        print('feature dict generate')
        file_list1 = os.listdir(DATASET_PATH)
        file_list2 = os.listdir(DATASET_PATH + '/train')
        file_list3 = os.listdir(DATASET_PATH + '/train/train_data')

        print(file_list1)
        print(file_list2)
        print(file_list3)
        resnet_feature_extractor(args.mode)

        print(file_list1)
        print(file_list2)
        print(file_list3)

        # One hot Encoding
        with open(os.path.join('train_image_features_50.pkl'), 'rb') as handle:
            image_feature_dict = pickle.load(handle)

        print('check artic feature')
        print(f"757518f4a3da : {image_feature_dict['757518f4a3da']}")

        lbe = LabelEncoder()
        lbe.fit(lit)
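        # lit is sorted, so the article_id-to-index mapping is deterministic across runs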
        item['article_id' + '_onehot'] = lbe.transform(item['article_id'])
        print(lbe.classes_)

        for feat in sparse_features[1:]:
            lbe = LabelEncoder()
            item[feat + '_onehot'] = lbe.fit_transform(
                item[feat])  # must also verify here that the re-encoded labels stay consistent across runs

        print(item.head(10))
        print('columns name : ', item.columns)
        fixlen_feature_columns = [SparseFeat('article_id', len(lit))]
        fixlen_feature_columns += [
            SparseFeat(feat, item[feat + '_onehot'].nunique())
            for feat in sparse_features[1:]
        ]
        fixlen_feature_columns += [
            DenseFeat('image_feature', len(image_feature_dict[artics[0]]))
        ]
        fixlen_feature_columns += [DenseFeat('read_cnt_prob', 1)]

        print(f'fixlen_feature_columns : {fixlen_feature_columns}')
        idx_artics_all = item['article_id' + '_onehot'].tolist()

        for i in range(len(artics)):
            idx_artic = idx_artics_all[i]
            if idx_artic not in id_to_artic.keys():
                id_to_artic[idx_artic] = artics[i]

        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns
        fixlen_feature_names = get_fixlen_feature_names(
            linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names
        model = xDeepFM(linear_feature_columns,
                        dnn_feature_columns,
                        task='binary')
        print('---model defined---')
        print(time.time() - s, 'seconds')

        ##### print need

        for artic in lit:
            print(artic, end=',')
        print()
        print('new')
        print()

        print(len(lit_cnt_prob))
        for prob in lit_cnt_prob:
            prob = round(prob, 4)
            print(prob, end=',')
        print()
        print('end')
        print('--------------')

    optimizer = tf.keras.optimizers.Adam(args.lr)
    s = time.time()

    # negative sampling
    item_pos = item[item['label'] == 1]
    item_neg = item[item['label'] == 0]

    dn_1 = item_neg.sample(n=3 * len(item_pos), random_state=42)
    dn_2 = item_neg.sample(n=3 * len(item_pos), random_state=20)
    dn_3 = item_neg.sample(n=3 * len(item_pos), random_state=7)
    dn_4 = item_neg.sample(n=3 * len(item_pos), random_state=33)
    dn_5 = item_neg.sample(n=3 * len(item_pos), random_state=41)

    dn_1.reset_index()

    data_1 = pd.concat([dn_1, item_pos]).sample(frac=1,
                                                random_state=42).reset_index()
    data_1_article_idxs = data_1['article_id_onehot'].tolist()
    data_1_article = data_1['article_id'].tolist()
    print(f'len data_1 : {len(data_1)}')
    print(data_1.head(5))
    li1 = []
    li2 = []
    li3 = []
    for i in range(len(data_1_article)):
        for j in range(len(lit_cnt_prob)):
            if data_1_article[i] == lit[j]:
                li3.append(lit_cnt_prob[j])
                break
    data_1['read_cnt_prob'] = li3
    print('---read_cnt_prob end---')
    ## preprocess append

    data_2 = pd.concat([dn_2, item_pos]).sample(frac=1,
                                                random_state=42).reset_index()
    data_3 = pd.concat([dn_3, item_pos]).sample(frac=1,
                                                random_state=42).reset_index()
    data_4 = pd.concat([dn_4, item_pos]).sample(frac=1,
                                                random_state=42).reset_index()
    data_5 = pd.concat([dn_5, item_pos]).sample(frac=1,
                                                random_state=42).reset_index()

    li = []
    for i in range(len(data_1_article_idxs)):
        image_feature = image_feature_dict[id_to_artic[data_1_article_idxs[i]]]
        li.append(image_feature)
    print(f'article_id : {data_1_article[0]}')
    print(f'article_image_feature : {image_feature_dict[data_1_article[0]]}')

    data_1['image_feature'] = li
    li = []
    print(f'finished data_1_image_feature : {time.time() - s} sec')

    if use_nsml:
        bind_nsml(model, optimizer, args.task)
    if args.pause:
        nsml.paused(scope=locals())

    if (args.mode == 'train') or args.dry_run:
        best_loss = 1000
        if args.dry_run:
            print('start dry-running...!')
            args.num_epochs = 1
        else:
            print('start training...!')

        model.compile(
            tf.keras.optimizers.Adam(args.lr),
            'mse',
            metrics=['accuracy'],
        )
        train_generator = data_generator(data_1)
        lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)

        # drop the checkpoint callback when running k-fold
        save_cbk = CustomModelCheckpoint()

        history = model.fit_generator(train_generator,
                                      epochs=100,
                                      verbose=2,
                                      workers=8,
                                      steps_per_epoch=np.ceil(
                                          len(data_1) / 2048),
                                      callbacks=[lr_scheduler, save_cbk])
        print('again')
Example 11
        data.loc[data.shape[0] + 1] = row
    sparse_features = ["user_id", "item_id", "item_category", "time"]
    target = ['behavior_type']
    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    lbe_store = {}
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
        lbe_store[feat] = lbe
    # 2.count #unique features for each sparse field
    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ]
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns +
                                                    dnn_feature_columns)

    # 3.generate input data for model
    data = data.sample(frac=0.001)
    train, test = train_test_split(data, test_size=0.2)
    # train = train[:1000]
    # test = test[:200]
    train_model_input = [train[name].values for name in fixlen_feature_names]
    test_model_input = [test[name].values for name in fixlen_feature_names]
    # 4.Define Model,train,predict and evaluate
    model = DeepFM(linear_feature_columns,
                   dnn_feature_columns,
                   task='regression')
    model.compile(
        "adam",
        "mse",
Example 12
def main(args, local):
    
    if args.arch == 'xDeepFM' and args.mode == 'train':
        s = time.time()
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data', 'train_data')
        item = pd.read_csv(csv_file,
                    dtype={
                        'article_id': str,
                        'hh': int, 'gender': str,
                        'age_range': str,
                        'read_article_ids': str
                    }, sep='\t')
        label_data_path = os.path.join(DATASET_PATH, 'train',
                                os.path.basename(os.path.normpath(csv_file)).split('_')[0] + '_label')
        label = pd.read_csv(label_data_path,
                    dtype={'label': int},
                    sep='\t')
        item['label']  = label
        
        sparse_features = ['article_id', 'hh','gender','age_range','len_bin']
        dense_features = ['image_feature']
        target = ['label']
        
        
        len_lis = []

        read_article_ids_all = item['read_article_ids'].tolist()
        for i in range(len(item)):
            li = read_article_ids_all[i]
            if type(li) == float:
                len_lis.append(0)
                continue
            len_li = len(li.split(','))
            len_lis.append(len_li)
        
        
        item['len']  = len_lis
        item['len_bin']  = pd.qcut(item['len'],6,duplicates='drop')
    
        id_to_artic = dict()
        artics = item['article_id'].tolist()
        
        with open(os.path.join(DATASET_PATH, 'train', 'train_data', 'train_image_features.pkl'), 'rb') as handle:
            image_feature_dict = pickle.load(handle)
        for feat in sparse_features:
            lbe = LabelEncoder()
            item[feat] = lbe.fit_transform(item[feat])
        fixlen_feature_columns = [SparseFeat(feat, item[feat].nunique()) for feat in sparse_features]
        fixlen_feature_columns += [DenseFeat(feat,len(image_feature_dict[artics[0]])) for feat in dense_features]
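        # image_feature's dense dimension is taken from the length of one stored image-feature vector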
        
        
        
        idx_artics_all = item['article_id'].tolist()
        
        for i in range(len(artics)):
            idx_artic = idx_artics_all[i]
            if idx_artic not in id_to_artic.keys():
                id_to_artic[idx_artic] = artics[i]
        
       
            # can be looked up via image_feature_dict[article_id], so skip this for now
        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns  
        fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names
        model = xDeepFM(linear_feature_columns, dnn_feature_columns, task= 'binary')
        print('---model defined---')
        # also add code to save the generated files, since this can't be rerun every time
        print(time.time() - s ,'seconds')


    if use_nsml and args.mode == 'train':

        bind_nsml(model,[], args.task)
    
    
    if args.mode == 'test':
        print('_infer root - : ', DATASET_PATH)
        print('test')
        model, fixlen_feature_names_global, item, image_feature_dict, id_to_artic = get_item(DATASET_PATH)
        bind_nsml(model, [], args.task)
        checkpoint_session = ['401','team_62/airush2/176']
        nsml.load(checkpoint = str(checkpoint_session[0]), session = str(checkpoint_session[1])) 
        print('successfully loaded')

    if (args.mode == 'train'):
        if args.dry_run:
            print('start dry-running...!')
            args.num_epochs = 1
        else:
            print('start training...!')
        # build all inputs up front; a generator isn't strictly needed

        nsml.save('infer')
        print('end')
    print('end_main')

    if args.pause:
        nsml.paused(scope=local)
Example 13
def get_item(root):
    print('load')
    csv_file = os.path.join(root, 'test', 'test_data', 'test_data')
    item = pd.read_csv(csv_file,
                dtype={
                    'article_id': str,
                    'hh': int, 'gender': str,
                    'age_range': str,
                    'read_article_ids': str
                }, sep='\t')
    print('loaded!!')
    sparse_features = ['article_id', 'hh','gender','age_range','len_bin']
    dense_features = ['image_feature']
    target = ['label']

    len_lis = []

    read_article_ids_all = item['read_article_ids'].tolist()
    for i in range(len(item)):
        li = read_article_ids_all[i]
        if type(li) == float:
            len_lis.append(0)
            continue
        len_li = len(li.split(','))
        len_lis.append(len_li)
    
    
    item['len']  = len_lis
    item['len_bin']  = pd.qcut(item['len'],6,duplicates='drop')

    id_to_artic = dict()
    artics = item['article_id'].tolist()
    
    with open(os.path.join(DATASET_PATH, 'test', 'test_data', 'test_image_features.pkl'), 'rb') as handle:
        image_feature_dict = pickle.load(handle)

    print('image_feature_dict loaded..')
    for feat in sparse_features:
        lbe = LabelEncoder()
        item[feat] = lbe.fit_transform(item[feat])

    # could also be built from the test set, or from item...
    fixlen_feature_columns = []
    for feat in sparse_features:
        if feat == 'article_id':
            fixlen_feature_columns.append(SparseFeat(feat,1896))
        else:
            fixlen_feature_columns.append(SparseFeat(feat,item[feat].nunique()))
    #fixlen_feature_columns = [SparseFeat(feat, item[feat].nunique()) for feat in sparse_features]
    fixlen_feature_columns += [DenseFeat(feat,len(image_feature_dict[artics[0]])) for feat in dense_features]
    
    print(fixlen_feature_columns)
    
    
    idx_artics_all = item['article_id'].tolist()
    
    for i in range(len(artics)):
        idx_artic = idx_artics_all[i]
        if idx_artic not in id_to_artic.keys():
            id_to_artic[idx_artic] = artics[i]
    
    
       
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns  
    fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
    
    fixlen_feature_names_global = fixlen_feature_names

    model = xDeepFM(linear_feature_columns, dnn_feature_columns, task= 'binary')
    #bind_nsml(model, list(), args.task)

    return model, fixlen_feature_names_global, item,image_feature_dict, id_to_artic
Example 14
import numpy as np
from deepctr.models import DIN
from deepctr.inputs import SparseFeat,VarLenSparseFeat,DenseFeat,get_fixlen_feature_names,get_varlen_feature_names

feature_columns = [SparseFeat('user',3),SparseFeat(
    'gender', 2), SparseFeat('item', 3 + 1), SparseFeat('item_gender', 2 + 1),DenseFeat('score', 1)]
feature_columns += [VarLenSparseFeat('hist_item',3+1, maxlen=4, embedding_name='item'),
                    VarLenSparseFeat('hist_item_gender',3+1, maxlen=4, embedding_name='item_gender')]
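# behavior_feature_list names the base features whose hist_* sequences DIN attends over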
behavior_feature_list = ["item", "item_gender"]
uid = np.array([0, 1, 2])
ugender = np.array([0, 1, 0])
iid = np.array([1, 2, 3])  # 0 is mask value
igender = np.array([1, 2, 1])  # 0 is mask value
score = np.array([0.1, 0.2, 0.3])

hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'item_gender': igender,
                'hist_item': hist_iid, 'hist_item_gender': hist_igender, 'score': score}

fixlen_feature_names = get_fixlen_feature_names(feature_columns)
varlen_feature_names = get_varlen_feature_names(feature_columns)
x = [feature_dict[name] for name in fixlen_feature_names] + [feature_dict[name] for name in varlen_feature_names]

y = [1, 0, 1]

model = DIN(feature_columns, behavior_feature_list, hist_len_max=4, )
model.compile('adam', 'binary_crossentropy',
              metrics=['binary_crossentropy'])
history = model.fit(x, y, verbose=1, epochs=10, validation_split=0.5)
Example 15
def main(args):
    if args.arch == 'MLP':
        model = get_mlp(num_classes=args.num_classes)
    elif args.arch == 'Resnet':
        model = get_resnet18(num_classes=args.num_classes)
    elif args.arch == 'xDeepFM':
        s = time.time()
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data',
                                'train_data')
        item = pd.read_csv(csv_file,
                           dtype={
                               'article_id': str,
                               'hh': int,
                               'gender': str,
                               'age_range': str,
                               'read_article_ids': str
                           },
                           sep='\t')
        label_data_path = os.path.join(
            DATASET_PATH, 'train',
            os.path.basename(os.path.normpath(csv_file)).split('_')[0] +
            '_label')
        label = pd.read_csv(label_data_path, dtype={'label': int}, sep='\t')
        item['label'] = label
        print(len(item))
        sparse_features = [
            'article_id', 'hh', 'gender', 'age_range', 'len_bin'
        ]
        dense_features = ['image_feature']
        target = ['label']
        print(time.time() - s, 'seconds')
        s = time.time()
        len_lis = []

        read_article_ids_all = item['read_article_ids'].tolist()
        for i in range(len(item)):
            li = read_article_ids_all[i]
            if type(li) == float:
                len_lis.append(0)
                continue
            len_li = len(li.split(','))
            len_lis.append(len_li)
        print(f'read_article_ids_all len : {len(read_article_ids_all)}')
        """
        def extract_len_read_article(read_article_ids):
            if type(read_article_ids) == float:
                return 0
            else :
                return len(read_article_ids.split(','))
        read_article_ids_all = item['read_article_ids'].tolist()
        with Pool(processes=6) as p:
            len_lis = list(tqdm(p.imap(extract_len_read_article, read_article_ids_all), total=len(read_article_ids_all)))
        """
        item['len'] = len_lis
        item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')
        print('len_bin finished ', time.time() - s, 'seconds')
        id_to_artic = dict()
        artics = item['article_id'].tolist()

        with open(
                os.path.join(DATASET_PATH, 'train', 'train_data',
                             'train_image_features.pkl'), 'rb') as handle:
            image_feature_dict = pickle.load(handle)
        for feat in sparse_features:
            lbe = LabelEncoder()
            item[feat] = lbe.fit_transform(item[feat])
        fixlen_feature_columns = [
            SparseFeat(feat, item[feat].nunique()) for feat in sparse_features
        ]
        fixlen_feature_columns += [
            DenseFeat(feat, len(image_feature_dict[artics[0]]))
            for feat in dense_features
        ]
        print(artics[0])
        print(fixlen_feature_columns)
        """
        [SparseFeat(name='article_id', dimension=1896, use_hash=False, dtype='int32', embedding_name='article_id', embedding=True), SparseFeat(name='hh', dimension=24, use_hash=False, dtype='int32', embedding_name='hh', embedding=True), SparseFeat(name='gender', dimension=2, use_hash=False, dtype='int32', embedding_name='gender', embedding=True), SparseFeat(name='age_range', dimension=9, use_hash=False, dtype='int32', embedding_name='age_range', embedding=True), SparseFeat(name='len_bin', dimension=5, use_hash=False, dtype='int32', embedding_name='len_bin', embedding=True), DenseFeat(name='image_feature', dimension=2048, dtype='float32')]
        
        """
        print('---fixlen_feature_columns finished---')
        s = time.time()
        idx_artics_all = item['article_id'].tolist()
        print(f'idx_artics_all len : {len(idx_artics_all)}')
        print(f'artics len : {len(artics)}')
        for i in range(len(artics)):
            idx_artic = idx_artics_all[i]
            if idx_artic not in id_to_artic.keys():
                id_to_artic[idx_artic] = artics[i]
        print(f'id_to_artic len : {len(id_to_artic)}')
        print(time.time() - s, 'seconds')
        # can be looked up via image_feature_dict[article_id], so skip this for now
        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns
        fixlen_feature_names = get_fixlen_feature_names(
            linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names
        model = xDeepFM(linear_feature_columns,
                        dnn_feature_columns,
                        task='binary')
        print('---model defined---')
        # also add code to save the generated files, since this can't be rerun every time
    """
    if args.use_gpu:
        model = model.cuda()
    else:
        model = model.cpu()

    """
    optimizer = tf.keras.optimizers.Adam(args.lr)

    # negative sampling
    item_pos = item[item['label'] == 1]
    item_neg = item[item['label'] == 0]
    print(f'len item_pos : {len(item_pos)}')
    print(f'len item_neg : {len(item_neg)}')

    dn_1 = item_neg.sample(n=2 * len(item_pos), random_state=42)
    dn_1.reset_index()
    print(f'len dn_1 : {len(dn_1)}')

    data_1 = pd.concat([dn_1, item_pos]).sample(frac=1,
                                                random_state=42).reset_index()
    print(f'len data_1 : {len(data_1)}')
    print('--- negative sampling completed ---')

    s = time.time()
    data_1_article_idxs = data_1['article_id'].tolist()
    li = []
    for i in range(len(data_1_article_idxs)):
        image_feature = image_feature_dict[id_to_artic[data_1_article_idxs[i]]]
        li.append(image_feature)

    print(f'len image_feature : {len(li)}')
    data_1['image_feature'] = li
    li = []
    print(f'finished data_1_image_feature : {time.time() - s} sec')

    print(f'generate all x_train')

    if use_nsml:
        bind_nsml(model, optimizer, args.task)
    if args.pause:
        nsml.paused(scope=locals())

    if (args.mode == 'train') or args.dry_run:

        best_loss = 1000
        if args.dry_run:
            print('start dry-running...!')
            args.num_epochs = 1
        else:
            print('start training...!')
        # build all inputs up front; a generator isn't strictly needed
        model.compile(
            tf.keras.optimizers.Adam(args.lr),
            'mse',
            metrics=['accuracy'],
        )
        train_generator = data_generator(data_1)
        lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)

        save_cbk = CustomModelCheckpoint()

        history = model.fit_generator(train_generator,
                                      epochs=200,
                                      verbose=2,
                                      workers=8,
                                      steps_per_epoch=np.ceil(
                                          len(data_1) / 2048),
                                      callbacks=[lr_scheduler, save_cbk])
        print('again')
        """
Example 16
def main(args, local):
    
    if args.arch == 'xDeepFM' and args.mode == 'train':


        s = time.time()
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data', 'train_data')
        item = pd.read_csv(csv_file,
                    dtype={
                        'article_id': str,
                        'hh': int, 'gender': str,
                        'age_range': str,
                        'read_article_ids': str
                    }, sep='\t')
        label_data_path = os.path.join(DATASET_PATH, 'train',
                                os.path.basename(os.path.normpath(csv_file)).split('_')[0] + '_label')
        label = pd.read_csv(label_data_path,
                    dtype={'label': int},
                    sep='\t')
        item['label']  = label
        s = time.time()
        #print(f'before test article preprocess : {len(item)}')
        
        #print(f'after test  article preprocess : {len(item)}')
        #print(f'time : {time.time() - s}')

        sparse_features = ['article_id', 'hh','gender','age_range','len_bin']
        dense_features = ['image_feature', 'read_cnt_prob']
        target = ['label']
        
        ############################ make more feature !!!!!!! #################################
        ############## 1. read_article_ids len cnt -- user feature #################################################
        len_lis = []

        read_article_ids_all = item['read_article_ids'].tolist()
        for i in range(len(item)):
            li = read_article_ids_all[i]
            if type(li) == float:
                len_lis.append(0)
                continue
            len_li = len(li.split(','))
            len_lis.append(len_li)
        
        
        item['len']  = len_lis
        item['len_bin']  = pd.qcut(item['len'],6,duplicates='drop')
    
        id_to_artic = dict()
        artics = item['article_id'].tolist()
        

        #print(item.head(3))
        #print('columns name : ',item.columns)
        sparse_features = ['article_id', 'hh','gender','age_range','len_bin']
        dense_features = ['image_feature', 'read_cnt_prob']
        
        fixlen_feature_columns = [SparseFeat(feat, item[feat].nunique()) for feat in sparse_features]
        fixlen_feature_columns += [DenseFeat('image_feature',2048)]
        fixlen_feature_columns += [DenseFeat('read_cnt_prob',1)]
        
        #print(f'fixlen_feature_columns : {fixlen_feature_columns}')
 
        
        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns  
        fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names
        model = xDeepFM(linear_feature_columns, dnn_feature_columns, task= 'regression')
        print('---model defined---')
        #print(time.time() - s ,'seconds')


    if use_nsml and args.mode == 'train':

        bind_nsml(model,[], args.task)
    
    
    if args.mode == 'test':
        #print('_infer root - : ', DATASET_PATH)
        #print('test')
        #print('DATASET_PATH: ', DATASET_PATH)
        file_list= glob.glob(f'{DATASET_PATH}/test/test_data/*')
        #print('file_list: ',file_list)
        model, fixlen_feature_names_global, item, image_feature_dict,lit,lit_cnt_prob = get_item(DATASET_PATH,args.mode)
        bind_nsml(model, [], args.task)
        checkpoint_session = ['3','team_62/airush2/361']
        nsml.load(checkpoint = str(checkpoint_session[0]), session = str(checkpoint_session[1])) 
        #print('successfully loaded')

    if (args.mode == 'train'):
        #print('DATASET_PATH: ', DATASET_PATH)
        #file_list= glob.glob(f'{DATASET_PATH}/train/train_data/*')
        #print('file_list :',file_list)
        if args.dry_run:
            print('start dry-running...!')
            args.num_epochs = 1
        else:
            print('start training...!')
        # build all inputs up front; a generator isn't strictly needed

        nsml.save('infer')
        print('end')
    #print('end_main')

    if args.pause:
        nsml.paused(scope=local)
Example 17
def get_item(root, phase):
    #print('load')
    csv_file = os.path.join(root, 'test', 'test_data', 'test_data')
    item = pd.read_csv(csv_file,
                dtype={
                    'article_id': str,
                    'hh': int, 'gender': str,
                    'age_range': str,
                    'read_article_ids': str
                }, sep='\t')
    #print('loaded!!')
    sparse_features = ['article_id', 'hh','gender','age_range','len_bin']
    dense_features = ['image_feature', 'read_cnt_prob']
        

  
    global lit_cnt_prob_list
    lit_cnt_prob_list = lit_cnt_prob_list.replace(' ','')
    lit_cnt_prob_list = lit_cnt_prob_list.replace('\n','')
    lit_cnt_prob = lit_cnt_prob_list.split(',')
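    # lit_cnt_prob_list is a global comma-separated string (apparently dumped from the training run);
    # note the parsed values are still strings at this point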


    len_lis = []

    read_article_ids_all = item['read_article_ids'].tolist()
    for i in range(len(item)):
        li = read_article_ids_all[i]
        if type(li) == float:
            len_lis.append(0)
            continue
        len_li = len(li.split(','))
        len_lis.append(len_li)
    
    
    item['len']  = len_lis
    item['len_bin']  = pd.qcut(item['len'],6,duplicates='drop')


    artics = item['article_id'].tolist()
    lit = list(set(artics))
    lit.sort()
    print(f'len lit : {len(lit)}')
    #### fea
    #print('feature dict generate')
    #resnet_feature_extractor('test')

    with open(os.path.join('/data/airush2/test/test_data/test_image_features.pkl'), 'rb') as handle:
        image_feature_dict = pickle.load(handle)
    print('image_feature_dict loaded..')
    print('check artic feature')
    print(f"757518f4a3da : {image_feature_dict['757518f4a3da']}")
    
    
    lbe = LabelEncoder()
    lbe.fit(lit)
    item['article_id' + '_onehot'] = lbe.transform(item['article_id'])

    for feat in sparse_features[1:]:
        lbe = LabelEncoder()
        item[feat + '_onehot'] = lbe.fit_transform(item[feat])

    
    #print('----- after onehot encoding -----')
    #print(item.head(10))
    # could also be built from the test set, or from item...

    fixlen_feature_columns = [SparseFeat('article_id',1896)]
    fixlen_feature_columns += [SparseFeat(feat, item[feat +'_onehot'].nunique()) for feat in sparse_features[1:]]
    fixlen_feature_columns += [DenseFeat('image_feature',len(image_feature_dict[artics[0]]))]
    fixlen_feature_columns += [DenseFeat('read_cnt_prob',1)]
    
    #print(fixlen_feature_columns)
    
    
    idx_artics_all = item['article_id'].tolist()
    
       
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns  
    fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)
    
    fixlen_feature_names_global = fixlen_feature_names

    model = xDeepFM(linear_feature_columns, dnn_feature_columns, task= 'binary')
    #bind_nsml(model, list(), args.task)

    return model, fixlen_feature_names_global, item,image_feature_dict, lit, lit_cnt_prob
Example 18
def main(args):

    if args.arch == 'xDeepFM':
        s = time.time()
        csv_file = os.path.join(DATASET_PATH, 'train', 'train_data',
                                'train_data')
        item = pd.read_csv(csv_file,
                           dtype={
                               'article_id': str,
                               'hh': int,
                               'gender': str,
                               'age_range': str,
                               'read_article_ids': str
                           },
                           sep='\t')
        label_data_path = os.path.join(
            DATASET_PATH, 'train',
            os.path.basename(os.path.normpath(csv_file)).split('_')[0] +
            '_label')
        label = pd.read_csv(label_data_path, dtype={'label': int}, sep='\t')
        item['label'] = label

        sparse_features = [
            'article_id', 'hh', 'gender', 'age_range', 'len_bin'
        ]
        dense_features = ['image_feature']
        target = ['label']

        len_lis = []

        read_article_ids_all = item['read_article_ids'].tolist()
        for i in range(len(item)):
            li = read_article_ids_all[i]
            if type(li) == float:
                len_lis.append(0)
                continue
            len_li = len(li.split(','))
            len_lis.append(len_li)

        item['len'] = len_lis
        item['len_bin'] = pd.qcut(item['len'], 6, duplicates='drop')

        id_to_artic = dict()
        artics = item['article_id'].tolist()

        with open(
                os.path.join(DATASET_PATH, 'train', 'train_data',
                             'train_image_features.pkl'), 'rb') as handle:
            image_feature_dict = pickle.load(handle)
        for feat in sparse_features:
            lbe = LabelEncoder()
            item[feat] = lbe.fit_transform(item[feat])
        fixlen_feature_columns = [
            SparseFeat(feat, item[feat].nunique()) for feat in sparse_features
        ]
        fixlen_feature_columns += [
            DenseFeat(feat, len(image_feature_dict[artics[0]]))
            for feat in dense_features
        ]

        idx_artics_all = item['article_id'].tolist()

        for i in range(len(artics)):
            idx_artic = idx_artics_all[i]
            if idx_artic not in id_to_artic.keys():
                id_to_artic[idx_artic] = artics[i]

            # can be looked up via image_feature_dict[article_id], so skip this for now
        linear_feature_columns = fixlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns
        fixlen_feature_names = get_fixlen_feature_names(
            linear_feature_columns + dnn_feature_columns)
        print(fixlen_feature_names)
        global fixlen_feature_names_global
        fixlen_feature_names_global = fixlen_feature_names
        model = xDeepFM(linear_feature_columns,
                        dnn_feature_columns,
                        task='regression')
        print('---model defined---')
        # also add code to save the generated files, since this can't be rerun every time
        print(time.time() - s, 'seconds')

    optimizer = tf.keras.optimizers.Adam(args.lr)
    s = time.time()

    # negative sampling
    item_pos = item[item['label'] == 1]
    item_neg = item[item['label'] == 0]

    dn_1 = item_neg.sample(n=3 * len(item_pos), random_state=42)
    dn_1.reset_index()

    data_1 = pd.concat([dn_1, item_pos]).sample(frac=1,
                                                random_state=42).reset_index()

    data_1_article_idxs = data_1['article_id'].tolist()
    li = []
    for i in range(len(data_1_article_idxs)):
        image_feature = image_feature_dict[id_to_artic[data_1_article_idxs[i]]]
        li.append(image_feature)

    data_1['image_feature'] = li
    li = []
    print(f'finished data_1_image_feature : {time.time() - s} sec')

    if use_nsml:
        bind_nsml(model, optimizer, args.task)
    if args.pause:
        nsml.paused(scope=locals())

    if (args.mode == 'train') or args.dry_run:
        best_loss = 1000
        if args.dry_run:
            print('start dry-running...!')
            args.num_epochs = 1
        else:
            print('start training...!')
        # build all inputs up front; a generator isn't strictly needed
        model.compile(
            tf.keras.optimizers.Adam(args.lr),
            'mse',
            metrics=['accuracy'],
        )
        train_generator = data_generator(data_1)
        lr_scheduler = tf.keras.callbacks.LearningRateScheduler(scheduler)

        save_cbk = CustomModelCheckpoint()

        history = model.fit_generator(train_generator,
                                      epochs=100,
                                      verbose=2,
                                      workers=8,
                                      steps_per_epoch=np.ceil(
                                          len(data_1) / 2048),
                                      callbacks=[lr_scheduler, save_cbk])
        print('again')