def main():
    start_time = time.time()
    from time import gmtime, strftime
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    # if 1 == 1:
    train = pd.read_table('../input/mercari-price-suggestion-challenge/train.tsv', engine='c')
    test = pd.read_table('../input/mercari-price-suggestion-challenge/test.tsv', engine='c')

    #train = pd.read_table('../input/train.tsv', engine='c')
    #test = pd.read_table('../input/test.tsv', engine='c')

    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge: pd.DataFrame = pd.concat([train, dftt, test])
    submission: pd.DataFrame = test[['test_id']]

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))

    wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0],
                                                                  "hash_size": 2 ** 29, "norm": None, "tf": 'binary',
                                                                  "idf": None,
                                                                  }), procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del(wb)
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0],
                                                                  "hash_size": 2 ** 28, "norm": "l2", "tf": 1.0,
                                                                  "idf": None})
                             , procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del(wb)
    X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

    X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']],
                                          sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(time.time() - start_time))
    print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() - start_time))

    #    pd.to_pickle((sparse_merge, y), "xy.pkl")
    # else:
    #    nrow_train, nrow_test= 1481661, 1482535
    #    sparse_merge, y = pd.read_pickle("xy.pkl")

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    gc.collect()
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05, random_state=100)

    model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=sparse_merge.shape[1], iters=50, inv_link="identity", threads=1)

    model.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsF = model.predict(X_test)
    print('[{}] Predict FTRL completed'.format(time.time() - start_time))

    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0, init_fm=0.01,
                    D_fm=200, e_noise=0.0001, iters=15, inv_link="identity", threads=4)

    model.fit(train_X, train_y)
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

    params = {
        'learning_rate': 0.6,
        'application': 'regression',
        'max_depth': 4,
        'num_leaves': 31,
        'verbosity': -1,
        'metric': 'RMSE',
        'data_random_seed': 1,
        'bagging_fraction': 0.6,
        'bagging_freq': 5,
        'feature_fraction': 0.6,
        'nthread': 4,
        'min_data_in_leaf': 100,
        'max_bin': 31
    }

    # Remove features with document frequency <=100
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 100, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05, random_state=100)

    d_train = lgb.Dataset(train_X, label=train_y)
    watchlist = [d_train]
    if develop:
        d_valid = lgb.Dataset(valid_X, label=valid_y)
        watchlist = [d_train, d_valid]

    model = lgb.train(params, train_set=d_train, num_boost_round=6000, valid_sets=watchlist,
                      early_stopping_rounds=1000, verbose_eval=1000)

    if develop:
        preds = model.predict(valid_X)
        print("LGB dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsL = model.predict(X_test)

    print('[{}] Predict LGB completed.'.format(time.time() - start_time))

    preds = (predsF * 0.2 + predsL * 0.3 + predsFM * 0.5)

    submission['price'] = np.expm1(preds)
    submission.to_csv("submission_wordbatch_ftrl_fm_lgb.csv", index=False)
def main():
    start_time = time.time()
    from time import gmtime, strftime
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    # if 1 == 1:
    train = pd.read_table('../input/mercari-price-suggestion-challenge/train.tsv', engine='c')
    test = pd.read_table('../input/mercari-price-suggestion-challenge/test.tsv', engine='c')

    #train = pd.read_table('../input/train.tsv', engine='c')
    #test = pd.read_table('../input/test.tsv', engine='c')

    dev_preds = []
    test_preds = []
    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge: pd.DataFrame = pd.concat([train, dftt, test])
    submission: pd.DataFrame = test[['test_id']]

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))

    wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0],
                                                                  "hash_size": 2 ** 29, "norm": None, "tf": 'binary',
                                                                  "idf": None,
                                                                  }), procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del(wb)
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0],
                                                                  "hash_size": 2 ** 28, "norm": "l2", "tf": 1.0,
                                                                  "idf": None})
                             , procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del(wb)
    X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

    X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']],
                                          sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(time.time() - start_time))
    print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() - start_time))

    #    pd.to_pickle((sparse_merge, y), "xy.pkl")
    # else:
    #    nrow_train, nrow_test= 1481661, 1482535
    #    sparse_merge, y = pd.read_pickle("xy.pkl")

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    gc.collect()
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05, random_state=100)

    model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=sparse_merge.shape[1], iters=50, inv_link="identity", threads=1)

    model.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))
        dev_preds.append(preds)
    predsF = model.predict(X_test)
    test_preds.append(predsF)
    print('[{}] Predict FTRL completed'.format(time.time() - start_time))

    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0, init_fm=0.01,
                    D_fm=200, e_noise=0.0001, iters=15, inv_link="identity", threads=4)

    model.fit(train_X, train_y)
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))
        dev_preds.append(preds)

    predsFM = model.predict(X_test)
    test_preds.append(predsFM)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

    params = {
        'learning_rate': 0.6,
        'application': 'regression',
        'max_depth': 4,
        'num_leaves': 31,
        'verbosity': -1,
        'metric': 'RMSE',
        'data_random_seed': 1,
        'bagging_fraction': 0.6,
        'bagging_freq': 5,
        'feature_fraction': 0.6,
        'nthread': 4,
        'min_data_in_leaf': 100,
        'max_bin': 31
    }

    # Remove features with document frequency <=100
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 100, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    # re-split on the pruned feature set so the LGB train and test matrices
    # have matching column counts
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05, random_state=100)
    d_train = lgb.Dataset(train_X, label=train_y)
    watchlist = [d_train]
    if develop:
        d_valid = lgb.Dataset(valid_X, label=valid_y)
        watchlist = [d_train, d_valid]

    model = lgb.train(params, train_set=d_train, num_boost_round=6000, valid_sets=watchlist,
                      early_stopping_rounds=1000, verbose_eval=1000)

    if develop:
        preds = model.predict(valid_X)
        print("LGB dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))
        dev_preds.append(preds)

    predsL = model.predict(X_test)
    test_preds.append(predsL)

    print('[{}] Predict LGB completed.'.format(time.time() - start_time))

    #preds = (predsF * 0.2 + predsL * 0.3 + predsFM * 0.5)
    # NB: the weight search requires develop=True (dev_preds and valid_y are
    # only populated on the dev path)
    best_ratios = get_best_ratios(dev_preds, valid_y)
    print(best_ratios)
    preds = aggregate_predicts_N(dev_preds, best_ratios)
    print("(Best) RMSLE on dev set:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    preds = aggregate_predicts_N(test_preds, best_ratios)
    submission['price'] = np.expm1(preds)
    submission.to_csv("submission_wordbatch_ftrl_fm_lgb.csv", index=False)
Example #3
def getFMFTRL():
    start_time = time.time()  # referenced below but missing in the excerpt
    #os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')

    glove_file = '../feat/glove.6B.50d.txt'
    threads = 8
    save_dir = '../feat'

    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]

    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]
    # rows with a non-null brand whose lowercased brand never occurs as a
    # complete `name` value; for those, prepend the brand to the name
    ix = (merge['brand_name'] == merge['brand_name']) & (
        ~merge['brand_name'].str.lower().fillna('ZZZZZZ').isin(
            merge['name'].str.lower()))
    merge['name'][ix] = merge['brand_name'][ix] + ' ' + merge['name'][ix]

    # EXTRACT DEVELOPMENT TEST
    trnidx, validx = train_test_split(range(train.shape[0]),
                                      random_state=233,
                                      train_size=0.90)

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    #merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() -
                                                      start_time))

    for c, (k, v) in enumerate(bigram_mapper.items()):
        print(c, k, v)
        merge['name'] = merge.name.str.replace(k, v)
        merge['item_description'] = merge.item_description.str.replace(k, v)
    '''
    Crossed columns
    '''

    # My understanding of how to replicate what layers.crossed_column does;
    # see https://www.tensorflow.org/tutorials/linear.
    def cross_columns(x_cols):
        """simple helper to build the crossed columns in a pandas dataframe
        """
        crossed_columns = dict()
        colnames = ['_'.join(x_c) for x_c in x_cols]
        for cname, x_c in zip(colnames, x_cols):
            crossed_columns[cname] = x_c
        return crossed_columns

    merge['item_condition_id_str'] = merge['item_condition_id'].astype(str)
    merge['shipping_str'] = merge['shipping'].astype(str)
    x_cols = (
        ['brand_name', 'item_condition_id_str'],
        ['brand_name', 'subcat_1'],
        ['brand_name', 'subcat_2'],
        ['brand_name', 'general_cat'],
        #['brand_name',  'subcat_1',  'item_condition_id_str'],
        #['brand_name',  'subcat_2',  'item_condition_id_str'],
        #['brand_name',  'general_cat',  'item_condition_id_str'],
        ['brand_name', 'shipping_str'],
        ['shipping_str', 'item_condition_id_str'],
        ['shipping_str', 'subcat_2'],
        ['item_condition_id_str', 'subcat_2'])
    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(merge.select_dtypes(include=['object']).columns)

    D = 2**30
    for k, v in crossed_columns_d.items():
        print('Crossed column ', k)
        outls_ = []
        indicator = 0
        for col in v:
            outls_.append((np.array(merge[col].apply(hash))) % D + indicator)
            indicator += 10**6
        merge[k] = sum(outls_).tolist()
    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    lb = LabelBinarizer(sparse_output=True)
    x_col = lb.fit_transform(merge[cross_nm[0]])
    for i in range(1, len(cross_nm)):
        x_col = hstack((x_col, lb.fit_transform(merge[cross_nm[i]])))
    del (lb)
    '''
    Encode Original Strings
    '''
    '''
    for col in ['item_description', 'name']:    
        lb = LabelBinarizer(sparse_output=True)
        if 'X_orig' not in locals():
            X_orig = lb.fit_transform(merge[col].apply(hash))
        else:
            X_orig = hstack((X_orig, lb.fit_transform(merge[col].apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['item_description']+merge['name']).apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['brand_name']+merge['name']).apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['subcat_2']+merge['name']).apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['brand_name']+merge['name']+merge['item_description']).apply(hash))))
    X_orig = X_orig.tocsr()
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 2, 0, 1), dtype=bool)]
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 5000, 1, 0), dtype=bool)]    
    print ('Shape of original hash', X_orig.shape)
    X_orig = X_orig.tocoo()
    '''
    gc.collect()
    cpuStats()
    '''
    Hash name
    '''

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del (wb)
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1),
                                dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
    '''
    Hash category
    '''

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**20,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    cat = merge["category_name"].str.replace('/', ' ')
    X_cat = wb.fit_transform(cat)
    del (wb)
    X_cat = X_cat[:, np.array(np.clip(X_cat.getnnz(axis=0) - 1, 0, 1),
                              dtype=bool)]
    print('[{}] Vectorize `category` completed.'.format(time.time() -
                                                        start_time))
    '''
    Count category
    '''

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del (wb)
    X_description = X_description[:,
                                  np.array(np.clip(
                                      X_description.getnnz(axis=0) - 1, 0, 1),
                                           dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)

    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))
    '''
    print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape, X_orig.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name, X_cat,
                           x_col, X_orig)).tocsr()
    '''

    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape)
    sparse_merge = hstack(
        (X_dummies, X_description, X_brand, X_category1, X_category2,
         X_category3, X_name, X_cat, x_col)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    gc.collect()
    if develop:
        #train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233)
        train_X, valid_X, train_y, valid_y = X[trnidx], X[validx], y.values[
            trnidx], y.values[validx]

    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=1,
                    inv_link="identity",
                    threads=threads)  #iters=15

    baseline = 1.
    for i in range(15):
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline:
            baseline = score_
        else:
            break

    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        predsfm = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y),
                                          np.expm1(predsfm)))
        # 0.44532
        # Full data 0.424681

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

    return merge, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
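
# Example #3 additionally assumes `cpuStats` (a process-memory logger) and
# `bigram_mapper` (a dict mapping multi-word phrases to single tokens).
# Sketches follow; the phrase list shown is purely illustrative:
import psutil


def cpuStats():
    # print resident memory of the current process in GB
    process = psutil.Process(os.getpid())
    print('memory GB:', process.memory_info().rss / 2. ** 30)


bigram_mapper = {
    'brand new': 'brand_new',
    'free shipping': 'free_shipping',
}
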
Example #4
def main():
    feature_vectorized_file_name = 'Data/feature_vectorized2'
    # load cached features if present; otherwise build and cache them below
    if os.path.exists(feature_vectorized_file_name):
        sparse_merge, price = _load(feature_vectorized_file_name)
        print(sparse_merge.shape)
    else:
        ########################################################################
        start_time = time.time()
        merge, submission, price = get_extract_feature()
        merge = merge[:TRAIN_SIZE]

        merge['item_condition_id'] = merge['item_condition_id'].astype(
            'category')
        print('[{}] Convert categorical completed'.format(time.time() -
                                                          start_time))

        # vectorize features
        wb = CountVectorizer()
        X_category2 = wb.fit_transform(merge['category_2'])
        X_category3 = wb.fit_transform(merge['category_name'])
        X_brand2 = wb.fit_transform(merge['brand_name'])
        print(
            '[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                  start_time))

        lb = LabelBinarizer(sparse_output=True)
        X_brand = lb.fit_transform(merge['brand_name'])
        X_category1 = lb.fit_transform(merge['category_1'])
        X_category4 = lb.fit_transform(merge['category_name'])
        print(
            '[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                                 start_time))

        X_dummies = csr_matrix(
            pd.get_dummies(merge[['item_condition_id', 'shipping']],
                           sparse=True).values)

        # hand feature
        for col in merge.columns:
            if ('Len' in col) or ('Frec' in col):
                merge[col] = np.log1p(merge[col])
                merge[col] = merge[col] / merge[col].max()

        hand_feature = [
            'brand_name_Frec', 'item_description_wordLen',
            'brand_name_name_Intsct', 'brand_name_item_description_Intsct'
        ]
        X_hand_feature = merge[hand_feature].values

        name_w1 = param_space_best_WordBatch['name_w1']
        name_w2 = param_space_best_WordBatch['name_w2']
        desc_w1 = param_space_best_WordBatch['desc_w1']
        desc_w2 = param_space_best_WordBatch['desc_w2']

        wb = wordbatch.WordBatch(normalize_text=None,
                                 extractor=(WordBag, {
                                     "hash_ngrams": 2,
                                     "hash_ngrams_weights": [name_w1, name_w2],
                                     "hash_size": 2**28,
                                     "norm": None,
                                     "tf": 'binary',
                                     "idf": None,
                                 }),
                                 procs=8)
        wb.dictionary_freeze = True
        X_name = wb.fit_transform(merge['name'])
        del (wb)
        X_name = X_name[:,
                        np.array(np.clip(X_name.getnnz(axis=0) - 2, 0, 1),
                                 dtype=bool)]
        print('[{}] Vectorize `name` completed.'.format(time.time() -
                                                        start_time))

        merge['item_description'] = merge['category_2'].map(str)+' .#d3 .#d3 '+\
                                    merge['name'].map(str)+' .#d3 .#d3 '+\
                                    merge['item_description'].map(str)

        wb = wordbatch.WordBatch(normalize_text=None,
                                 extractor=(WordBag, {
                                     "hash_ngrams": 3,
                                     "hash_ngrams_weights": [desc_w1, desc_w2, 0.7],
                                     "hash_size": 2**28,
                                     "norm": "l2",
                                     "tf": 1.0,
                                     "idf": None
                                 }),
                                 procs=8)
        wb.dictionary_freeze = True
        X_description = wb.fit_transform(merge['item_description'])
        del (wb)
        X_description = X_description[:,
                                      np.array(np.clip(
                                          X_description.getnnz(axis=0) -
                                          6, 0, 1),
                                               dtype=bool)]
        print(
            '[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                  start_time))

        sparse_merge = hstack((X_dummies, X_brand, X_brand2, X_category1,
                               X_category2, X_category3, X_category4,
                               X_hand_feature, X_name, X_description)).tocsr()

        print(X_dummies.shape, X_brand.shape, X_brand2.shape,
              X_category1.shape, X_category2.shape, X_category3.shape,
              X_category4.shape, X_hand_feature.shape, X_name.shape,
              X_description.shape, sparse_merge.shape)

        _save(feature_vectorized_file_name, [sparse_merge, price])
        print('[{}] data saved.'.format(time.time() - start_time))

    ########################################################################
    # use hyperopt to find the best parameters of the model
    # use 3 fold cross validation

    # learner_name='best_FTRL'
    # learner_name='FTRL'
    learner_name = 'best_FM_FTRL'
    #learner_name='FM_FTRL'
    print(learner_name)
    logname = "[Learner@%s]_hyperopt_%s.log" % (learner_name,
                                                time_utils._timestamp())
    logger = logging_utils._get_logger('Log', logname)
    logger.info('start')

    optimizer = TaskOptimizer(learner_name, sparse_merge, price, logger)
    optimizer.run()

    a = 12
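
# Example #4 comes from a larger project; `_save`/`_load` are assumed to be
# plain pickle wrappers (sketched below), while `get_extract_feature`,
# `TRAIN_SIZE`, `param_space_best_WordBatch`, `TaskOptimizer`, `time_utils`,
# and `logging_utils` are project-specific and not reconstructed here.
import pickle


def _save(fname, data, protocol=2):
    with open(fname, 'wb') as f:
        pickle.dump(data, f, protocol)


def _load(fname):
    with open(fname, 'rb') as f:
        return pickle.load(f)
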
Example #5
def get_pred_ftrl(submission):
    start_time = time.time()
    from time import gmtime, strftime
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    # if 1 == 1:
    train = pd.read_table(
        '../input/mercari-price-suggestion-challenge/train.tsv', engine='c')
    test = pd.read_table(
        '../input/mercari-price-suggestion-challenge/test.tsv', engine='c')

    #train = pd.read_table('../input/train.tsv', engine='c')
    #test = pd.read_table('../input/test.tsv', engine='c')

    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    train = train[train["price"] != 0]
    #Xtrain,Xvalid = train_test_split(train, test_size=0.01,random_state=1)
    nrow_train = train.shape[0]
    #nrow_valid = Xvalid.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge: pd.DataFrame = pd.concat([train, test])
    #submission: pd.DataFrame = test[['test_id']]

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() -
                                                      start_time))

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del (wb)
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1),
                                dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del (wb)
    X_description = X_description[:,
                                  np.array(np.clip(
                                      X_description.getnnz(axis=0) - 1, 0, 1),
                                           dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))
    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))

    #    pd.to_pickle((sparse_merge, y), "xy.pkl")
    # else:
    #    nrow_train, nrow_test= 1481661, 1482535
    #    sparse_merge, y = pd.read_pickle("xy.pkl")

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_train:]
    print(sparse_merge.shape)

    gc.collect()
    train_X, train_y = X, y
    #'''
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X,
                                                              y,
                                                              test_size=0.05,
                                                              random_state=100)

    model = FTRL(alpha=0.01,
                 beta=0.1,
                 L1=0.00001,
                 L2=1.0,
                 D=sparse_merge.shape[1],
                 iters=50,
                 inv_link="identity",
                 threads=1)

    model.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsF = model.predict(X_test)
    submission['price_FTRL'] = predsF
    #print(rmsle(np.expm1(predsF),y_valid))
    #'''
    print('[{}] Predict FTRL completed'.format(time.time() - start_time))
    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=17,
                    inv_link="identity",
                    threads=4)

    model.fit(train_X, train_y)
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
    submission['price_FM_FTRL'] = predsFM
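
# Example #5 leaves both prediction columns on `submission` in log1p space.
# A hypothetical follow-up that blends them and writes a file; the 0.45/0.55
# weights and the file name are assumptions:
def blend_and_write(submission, w_ftrl=0.45, w_fm=0.55,
                    out_path='submission_ftrl_fm.csv'):
    preds = w_ftrl * submission['price_FTRL'] + w_fm * submission['price_FM_FTRL']
    # invert the log1p target transform before writing prices
    submission['price'] = np.expm1(preds)
    submission[['test_id', 'price']].to_csv(out_path, index=False)
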
Example #6
del(wb)
X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

wb = CountVectorizer()
X_category1 = wb.fit_transform(merge['general_cat'])
X_category2 = wb.fit_transform(merge['subcat_1'])
X_category3 = wb.fit_transform(merge['subcat_2'])
print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

# wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0],
                                                              "hash_size": 2 ** 28, "norm": "l2", "tf": 1.0,
                                                              "idf": None})
                         , procs=8)
wb.dictionary_freeze = True

# p = Pool(processes=8)
# merge['item_description'] = p.map(transform, merge.item_description.values)
# p.terminate()

X_description = wb.fit_transform(merge['item_description'])
del(wb)
gc.collect()
X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

lb = LabelBinarizer(sparse_output=True)
X_brand = lb.fit_transform(merge['brand_name'])
print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))
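
# The commented-out Pool block in Example #6 refers to a module-level
# `transform` so that multiprocessing can pickle it; a minimal sketch that
# simply reuses the assumed normalizer (the exact cleaning is an assumption):
def transform(text):
    # top-level so `Pool.map` can pickle it
    return normalize_text(str(text))
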
Example #7
def main():
    train = pd.read_table('../input/train.tsv', engine='c')
    test = pd.read_table('../input/test.tsv', engine='c')

    print('Finished loading data')
    nrow_test = train.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    y = np.log1p(train["price"])
    merge: pd.DataFrame = pd.concat([train, dftt, test])
    submission: pd.DataFrame = test[['test_id']]

    del train, test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('Split categories completed.')

    handle_missing_inplace(merge)
    print('Handle missing completed.')

    cutting(merge)
    print('Cut completed.')

    to_categorical(merge)
    print('Convert categorical completed')

    cv = CountVectorizer(min_df=NAME_MIN_DF)
    X_name_cv = cv.fit_transform(merge['name'])

    cv = CountVectorizer()
    X_category1_cv = cv.fit_transform(merge['general_cat'])
    X_category2_cv = cv.fit_transform(merge['subcat_1'])
    X_category3_cv = cv.fit_transform(merge['subcat_2'])

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 'hash_ngrams': 2,
                                 'hash_ngrams_weights': [1.5, 1.0],
                                 'hash_size': 2**29,
                                 'norm': None,
                                 'tf': 'binary',
                                 'idf': None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del wb
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1),
                                dtype=bool)]
    print('Vectorize `name` completed.')

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('Count vectorize `categories` completed.')

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 'hash_ngrams': 2,
                                 'hash_ngrams_weights': [1.0, 1.0],
                                 'hash_size': 2**28,
                                 'norm': 'l2',
                                 'tf': 1.0,
                                 'idf': None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del wb
    X_description = X_description[:,
                                  np.array(np.clip(
                                      X_description.getnnz(axis=0) - 1, 0, 1),
                                           dtype=bool)]
    print('Vectorize `item_description` completed.')

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('Label binarize `brand_name` completed.')

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('Get dummies on `item_condition_id` and `shipping` completed.')

    num_chars = merge['item_description'].apply(lambda x: len(x)).values
    num_words = merge['item_description'].apply(
        lambda x: len(x.split(' '))).values
    num_upper = merge['item_description'].apply(
        lambda x: len(re.findall('[A-Z]+', x))).values
    num_chars = num_chars / max(num_chars)
    num_words = num_words / max(num_words)
    num_upper = num_upper / max(num_upper)

    X_feature = np.vstack([num_chars, num_words, num_upper]).T
    print('musicmilif features completed.')

    sparse_merge = hstack(
        (X_dummies, X_description, X_brand, X_category1, X_category2,
         X_category3, X_name, X_category1_cv, X_category2_cv, X_category3_cv,
         X_name_cv, X_feature)).tocsr()
    print('Create sparse merge completed')
    del X_dummies, X_description, X_brand, X_category1, X_category2, X_category3
    del X_name, X_category1_cv, X_category2_cv, X_category3_cv, X_name_cv, X_feature
    del num_chars, num_words, num_upper
    gc.collect()

    # Remove features with document frequency <=1
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]

    gc.collect()

    train_X, train_y = X, y

    model = Ridge(solver='auto',
                  fit_intercept=True,
                  alpha=5.0,
                  max_iter=100,
                  normalize=False,
                  tol=0.05)
    model.fit(train_X, train_y)
    print('Train Ridge completed')
    predsR = model.predict(X_test)
    print('Predict Ridge completed')

    model = FTRL(alpha=0.01,
                 beta=0.1,
                 L1=0.00001,
                 L2=1.0,
                 D=sparse_merge.shape[1],
                 iters=50,
                 inv_link="identity",
                 threads=1)
    model.fit(train_X, train_y)
    print('Train FTRL completed')
    predsF = model.predict(X_test)
    print('Predict FTRL completed')

    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=17,
                    inv_link="identity",
                    threads=4)
    model.fit(train_X, train_y)
    print('Train FM_FTRL completed')
    predsFM = model.predict(X_test)
    print('Predict FM_FTRL completed')

    params = {
        'learning_rate': 0.6,
        'application': 'regression',
        'max_depth': 9,
        'num_leaves': 24,
        'verbosity': -1,
        'metric': 'RMSE',
        'data_random_seed': 1,
        'bagging_fraction': 0.9,
        'bagging_freq': 6,
        'feature_fraction': 0.8,
        'nthread': 4,
        'min_data_in_leaf': 51,
        'max_bin': 64
    }

    # Remove features with document frequency <=200
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 200, 0, 1),
                    dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]

    train_X, train_y = X, y
    d_train = lgb.Dataset(train_X, label=train_y)
    watchlist = [d_train]
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=1800,
                      valid_sets=watchlist,
                      early_stopping_rounds=500,
                      verbose_eval=400)

    predsL = model.predict(X_test)
    print('Predict LGBM completed')

    preds = (predsR * 1 + predsF * 1 + predsFM * 16 + predsL * 6) / (1 + 1 + 16 + 6)
    submission['price'] = np.expm1(preds)
    submission.to_csv("submission.csv", index=False)