def main():
    ########################################################################
    # use hyperopt to find the best parameters of the model
    # use 5 fold cross validation
    learner_name = 'vanila_ensemble'
    print(learner_name)
    logname = "[Learner@%s]_hyperopt_%s.log" % (learner_name,
                                                time_utils._timestamp())
    logger = logging_utils._get_logger('Log', logname)
    logger.info('start')

    optimizer = TaskOptimizer(learner_name, logger)
    optimizer.run()
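
# A hedged sketch (an assumption, not this repo's TaskOptimizer): the typical
# hyperopt loop behind "use hyperopt ... 5 fold cross validation" above, with
# sklearn's Ridge standing in for the ensemble learner and an illustrative
# one-parameter search space.
import numpy as np
from hyperopt import Trials, fmin, hp, tpe
from sklearn.linear_model import Ridge
from sklearn.model_selection import KFold


def hyperopt_cv_sketch(X, y, max_evals=50):
    """Return the parameters minimizing mean 5-fold RMSE (TPE search)."""

    def objective(params):
        kf = KFold(n_splits=5, shuffle=True, random_state=42)
        scores = []
        for tr, va in kf.split(X):
            model = Ridge(alpha=params['alpha']).fit(X[tr], y[tr])
            pred = model.predict(X[va])
            scores.append(np.sqrt(np.mean((pred - y[va]) ** 2)))
        return float(np.mean(scores))

    space = {'alpha': hp.loguniform('alpha', np.log(1e-3), np.log(10.0))}
    return fmin(fn=objective, space=space, algo=tpe.suggest,
                max_evals=max_evals, trials=Trials())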
def main():
    feature_vectorized_file_name = 'Data/feature_vectorized2'
    if os.path.exists(feature_vectorized_file_name):
        sparse_merge, price = _load(feature_vectorized_file_name)
        print(sparse_merge.shape)
    else:
        ########################################################################
        start_time = time.time()
        merge, submission, price = get_extract_feature()
        merge = merge[:TRAIN_SIZE]

        merge['item_condition_id'] = merge['item_condition_id'].astype('category')
        print('[{}] Convert categorical completed'.format(time.time() - start_time))

        # vectorize features
        wb = CountVectorizer()
        X_category2 = wb.fit_transform(merge['category_2'])
        X_category3 = wb.fit_transform(merge['category_name'])
        X_brand2 = wb.fit_transform(merge['brand_name'])
        print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

        lb = LabelBinarizer(sparse_output=True)
        X_brand = lb.fit_transform(merge['brand_name'])
        X_category1 = lb.fit_transform(merge['category_1'])
        X_category4 = lb.fit_transform(merge['category_name'])
        print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

        X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']], sparse=True).values)

        # hand feature
        for col in merge.columns:
            if ('Len' in col) or ('Frec' in col):
                merge[col] = np.log1p(merge[col])
                merge[col] = merge[col] / merge[col].max()

        hand_feature = ['brand_name_Frec', 'item_description_wordLen', 'brand_name_name_Intsct',
                        'brand_name_item_description_Intsct']
        X_hand_feature = merge[hand_feature].values
        name_w1 = param_space_best_WordBatch['name_w1']
        name_w2 = param_space_best_WordBatch['name_w2']
        desc_w1 = param_space_best_WordBatch['desc_w1']
        desc_w2 = param_space_best_WordBatch['desc_w2']

        wb = wordbatch.WordBatch(normalize_text=None, extractor=(WordBag, {
            "hash_ngrams": 2,
            "hash_ngrams_weights": [name_w1, name_w2],
            "hash_size": 2 ** 28,
            "norm": None,
            "tf": 'binary',
            "idf": None,
        }), procs=8)
        wb.dictionary_freeze = True
        X_name = wb.fit_transform(merge['name'])
        del wb
        # keep only hashed columns that occur in at least 3 rows
        X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 2, 0, 1), dtype=bool)]
        print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))


        merge['item_description'] = (merge['category_2'].map(str) + ' E ' +
                                     merge['name'].map(str) + ' E ' +
                                     merge['item_description'].map(str))

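        # WordBag below is wordbatch's hashing n-gram vectorizer: with this
        # config it hashes 1- to 3-grams into a 2**28-dimensional space,
        # weights each n-gram order via `hash_ngrams_weights`, and
        # L2-normalizes the rows; `dictionary_freeze` fixes the vocabulary.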
        wb = wordbatch.WordBatch(normalize_text=None, extractor=(WordBag, {
            "hash_ngrams": 3,
            "hash_ngrams_weights": [desc_w1, desc_w2, 0.7],
            "hash_size": 2 ** 28,
            "norm": "l2",
            "tf": 1.0,
            "idf": None,
        }), procs=8)
        wb.dictionary_freeze = True
        X_description = wb.fit_transform(merge['item_description'])
        del wb
        # keep only hashed columns that occur in at least 7 rows
        X_description = X_description[:, np.array(
            np.clip(X_description.getnnz(axis=0) - 6, 0, 1), dtype=bool)]
        print('[{}] Vectorize `item_description` completed.'.format(
            time.time() - start_time))
        print(X_description.shape)

        sparse_merge = hstack((X_dummies, X_brand, X_brand2, X_category1,
                               X_category2, X_category3, X_category4,
                               X_hand_feature, X_name, X_description)).tocsr()

        print(X_dummies.shape, X_brand.shape, X_brand2.shape,
              X_category1.shape, X_category2.shape, X_category3.shape,
              X_category4.shape, X_hand_feature.shape, X_name.shape,
              X_description.shape, sparse_merge.shape)

        _save(feature_vectorized_file_name, [sparse_merge, price])
        print('[{}] data saved.'.format(time.time() - start_time))

    ########################################################################
    # use hyperopt to find the best parameters of the model
    # use 3 fold cross validation

    # learner_name='best_FTRL'
    # learner_name='FTRL'
    learner_name = 'best_FM_FTRL'
    #learner_name='FM_FTRL'
    print(learner_name)
    logname = "[Learner@%s]_hyperopt_%s.log" % (learner_name,
                                                time_utils._timestamp())
    logger = logging_utils._get_logger('Log', logname)
    logger.info('start')

    optimizer = TaskOptimizer(learner_name, sparse_merge, price, logger)
    optimizer.run()

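
# A hedged sketch (an assumption, not this repo's TaskOptimizer): how a
# 'best_FM_FTRL' learner is typically fit in Mercari-style pipelines, i.e.
# wordbatch's FM_FTRL trained on the sparse matrix built above. All
# hyperparameter values here are illustrative placeholders.
import numpy as np
from wordbatch.models import FM_FTRL


def fm_ftrl_sketch(sparse_merge, price):
    """Fit FM_FTRL on log1p(price); return predictions on the price scale."""
    model = FM_FTRL(alpha=0.01, beta=0.01, L1=1e-5, L2=0.1,
                    D=sparse_merge.shape[1],      # one weight per input column
                    alpha_fm=0.01, L2_fm=0.0, init_fm=0.01,
                    D_fm=200,                     # FM latent dimension
                    e_noise=1e-4, iters=15,
                    inv_link="identity", threads=4)
    model.fit(sparse_merge, np.log1p(price))
    return np.expm1(model.predict(sparse_merge))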
def main():
    feature_vectorized_file_name = 'Data/feature_vectorized'
    if os.path.exists(feature_vectorized_file_name):
        sparse_merge, price, desc, name = _load(feature_vectorized_file_name)
        print(sparse_merge.shape)
    else:
        ########################################################################
        start_time = time.time()
        merge, submission, price = get_extract_feature()
        merge = merge[:TRAIN_SIZE]

        merge['item_condition_id'] = merge['item_condition_id'].astype(
            'category')
        print('[{}] Convert categorical completed'.format(time.time() -
                                                          start_time))

        brands_filling(merge)

        merge['gencat_cond'] = merge['category_1'].map(
            str) + '_' + merge['item_condition_id'].astype(str)
        merge['subcat_1_cond'] = merge['category_2'].map(
            str) + '_' + merge['item_condition_id'].astype(str)
        merge['subcat_2_cond'] = merge['category_name'].map(
            str) + '_' + merge['item_condition_id'].astype(str)
        print(
            f'[{time.time() - start_time}] Categories and item_condition_id concatenated.'
        )

        # vectorize features
        wb = CountVectorizer()
        X_category2 = wb.fit_transform(merge['category_2'])
        X_category3 = wb.fit_transform(merge['category_name'])
        X_brand2 = wb.fit_transform(merge['brand_name'])
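        # token_pattern='.+' makes each full "category_condition" string one
        # token, and min_df=2 drops combinations that occur only once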
        wb = CountVectorizer(token_pattern='.+', min_df=2)
        X_gencat_cond = wb.fit_transform(merge['gencat_cond'])
        X_subcat_1_cond = wb.fit_transform(merge['subcat_1_cond'])
        X_subcat_2_cond = wb.fit_transform(merge['subcat_2_cond'])
        print(
            '[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                  start_time))

        lb = LabelBinarizer(sparse_output=True)
        X_brand = lb.fit_transform(merge['brand_name'])
        X_category1 = lb.fit_transform(merge['category_1'])
        X_category4 = lb.fit_transform(merge['category_name'])
        print(
            '[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                                 start_time))

        X_dummies = csr_matrix(
            pd.get_dummies(merge[['item_condition_id', 'shipping']],
                           sparse=True).values)

        # hand feature
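        # log1p compresses the heavy-tailed count/length columns, then each
        # column is rescaled to [0, 1] by its maximum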
        columns = ['brand_name_Frec', 'item_description_wordLen']
        for col in columns:
            merge[col] = np.log1p(merge[col])
            merge[col] = merge[col] / merge[col].max()

        hand_feature = [
            'brand_name_Frec', 'item_description_wordLen',
            'brand_name_name_Intsct', 'brand_name_item_description_Intsct',
            'has_brand'
        ]

        X_hand_feature = merge[hand_feature].values

        sparse_merge = hstack(
            (X_dummies, X_brand, X_brand2, X_category1, X_category2,
             X_category3, X_category4, X_hand_feature, X_gencat_cond,
             X_subcat_1_cond, X_subcat_2_cond)).tocsr()

        print(X_dummies.shape, X_brand.shape, X_brand2.shape,
              X_category1.shape, X_category2.shape, X_category3.shape,
              X_category4.shape, X_hand_feature.shape, sparse_merge.shape,
              X_gencat_cond.shape, X_subcat_1_cond.shape,
              X_subcat_2_cond.shape)


        merge['item_description'] = (merge['category_2'].map(str) + ' . . ' +
                                     merge['name'].map(str) + ' . . ' +
                                     merge['item_description'].map(str))

        desc = merge['item_description']
        name = merge['name']

        _save(feature_vectorized_file_name, [sparse_merge, price, desc, name])
        print('[{}] data saved.'.format(time.time() - start_time))

    ########################################################################
    # use hyperopt to find the best parameters of the model
    learner_name = 'best_WordBatch'
    #learner_name='WordBatch'
    print(learner_name)
    logname = "[Learner@%s]_hyperopt_%s.log" % (learner_name,
                                                time_utils._timestamp())
    logger = logging_utils._get_logger('Log', logname)
    logger.info('start')

    optimizer = TaskOptimizer(learner_name, sparse_merge, price, desc, name,
                              logger)
    optimizer.run()
def main():
    ########################################################################
    file_name = 'Data/feature_processed'
    if os.path.exists(file_name):
        merge, Item_size, hand_feature = _load(file_name)
        print(hand_feature)
    else:
        start_time = time.time()
        merge, submission, y = get_extract_feature()

        merge['item_condition_id'] = merge['item_condition_id'].astype(
            'category')
        print('[{}] Convert categorical completed'.format(time.time() -
                                                          start_time))

        Item_size = {}
        # Label_Encoder brand_name + category
        columns = ['category_1', 'category_2', 'category_name', 'brand_name']
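        # encode the four columns in parallel; Pool.imap preserves input
        # order, so the results line up with `columns` in the zip below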
        p = multiprocessing.Pool(4)
        dfs = p.imap(Label_Encoder, [merge[col] for col in columns])
        for col, df in zip(columns, dfs):
            merge[col] = df
            Item_size[col] = merge[col].max() + 1
        print('[{}] Label Encode `brand_name` and `categories` completed.'.
              format(time.time() - start_time))

        # sequence item_description, name
        columns = ['item_description', 'name']
        p = multiprocessing.Pool(4)
        dfs = p.imap(Item_Tokenizer, [merge[col] for col in columns])
        for col, df in zip(columns, dfs):
            merge['Seq_' + col], Item_size[col] = df
        print('[{}] sequence `item_description` and `name` completed.'.format(
            time.time() - start_time))
        print(Item_size)

        # hand feature
        columns = ['brand_name_Frec', 'item_description_wordLen']
        for col in columns:
            merge[col] = np.log1p(merge[col])
            merge[col] = merge[col] / merge[col].max()

        hand_feature = [
            'brand_name_Frec', 'item_description_wordLen',
            'brand_name_name_Intsct', 'brand_name_item_description_Intsct'
        ]

        _save(file_name, [merge, Item_size, hand_feature])
        print('[{}] data saved.'.format(time.time() - start_time))

    ########################################################################
    # use hyperopt to find the best parameters of the model
    # use 3 fold cross validation

    #learner_name = 'best_GRU_ensemble'
    # learner_name = 'GRU_ensemble'
    #learner_name = 'best_con1d_ensemble'
    # learner_name = 'con1d_ensemble'
    learner_name = 'best_vanila_con1d'
    # learner_name='best_vanila_GRU'
    #learner_name='vanila_con1d'
    # learner_name='vanila_GRU'
    print(learner_name)
    logname = "[Learner@%s]_hyperopt_%s.log" % (learner_name,
                                                time_utils._timestamp())
    logger = logging_utils._get_logger('Log', logname)
    logger.info('start')

    optimizer = TaskOptimizer(learner_name, merge, Item_size, hand_feature,
                              logger)
    optimizer.run()
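
# A hedged sketch (an assumption, not the repo's actual learner): the rough
# shape of a 'vanila_con1d' regressor for this data, combining one tokenized
# text sequence with the dense hand features. Layer sizes are illustrative
# placeholders; written against the Keras functional API.
from keras.layers import (Input, Embedding, Conv1D, GlobalMaxPooling1D, Dense,
                          concatenate)
from keras.models import Model


def build_con1d_sketch(seq_len, vocab_size, n_hand):
    """Build a small Conv1D model over token ids plus dense hand features."""
    seq_in = Input(shape=(seq_len,))
    hand_in = Input(shape=(n_hand,))
    x = Embedding(vocab_size, 32)(seq_in)            # token embeddings
    x = Conv1D(64, kernel_size=3, activation='relu')(x)
    x = GlobalMaxPooling1D()(x)                      # strongest filter response
    x = concatenate([x, hand_in])
    x = Dense(64, activation='relu')(x)
    out = Dense(1)(x)                                # regression on log price
    model = Model(inputs=[seq_in, hand_in], outputs=out)
    model.compile(loss='mse', optimizer='adam')
    return model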
#coding: utf-8

import os
import sys

import pkl_utils
import time_utils
import logging_utils

import config

feature_name = "stack1"

logname = "feature_combiner_%s_%s.log" % (feature_name,
                                          time_utils._timestamp())
logger = logging_utils._get_logger(config.LOG_DIR, logname)

data_dict = {
    "train_basic": "newdata/train_v20.csv",
    "train_files": [],
    "test_files": [],
}

fname = os.path.join(config.FEAT_DIR, "Combine",
                     feature_name + config.FEAT_FILE_SUFFIX)
pkl_utils._save(fname, data_dict)
logger.info("Save to %s" % fname)