Example no. 1
    def __init__(self, pickle_model="", datadir=None):
        self.wb = wordbatch.WordBatch(
            normalize_text,
            extractor=(Hstack, [
                (WordVec, {
                    "wordvec_file":
                    "../../../data/word2vec/glove.twitter.27B.100d.txt.gz",
                    "normalize_text": normalize_text
                }),
                (WordVec, {
                    "wordvec_file":
                    "../../../data/word2vec/glove.6B.50d.txt.gz",
                    "normalize_text": normalize_text
                })
            ]))

        self.wb.dictionary_freeze = True

        self.clf = FTRL(alpha=1.0,
                        beta=1.0,
                        L1=0.00001,
                        L2=1.0,
                        D=2**25,
                        iters=1,
                        inv_link="identity")

        if datadir == None:
            (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)
Example no. 2
class WordvecRegressor(object):
    def __init__(self, pickle_model="", datadir=None):
        self.wb= wordbatch.WordBatch(normalize_text,
                                            extractor=(Hstack,
                                   [(WordVec, {"wordvec_file": "../../../data/word2vec/glove.twitter.27B.100d.txt.gz",
                                    "normalize_text": normalize_text}),
                                   (WordVec, {"wordvec_file": "../../../data/word2vec/glove.6B.50d.txt.gz",
                                 "normalize_text": normalize_text})]))

        self.wb.dictionary_freeze= True

        self.clf= FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=1, inv_link= "identity")

        if datadir==None:  (self.wb, self.clf)= pkl.load(gzip.open(pickle_model, 'rb'))
        else: self.train(datadir, pickle_model)

    def fit_batch(self, texts, labels, rcount):
        texts, labels = self.wb.shuffle_batch(texts, labels, rcount)
        print("Transforming", rcount)
        texts= self.wb.fit_transform(texts, reset= False)
        print("Training", rcount)
        self.clf.fit(texts, labels, reset= False)

    def train(self, datadir, pickle_model=""):
        texts= []
        labels= []
        training_data= os.listdir(datadir)
        rcount= 0
        batchsize= 100000

        p= None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    #if rcount > 1000000: break
                    try: line= json.loads(line.strip())
                    except:  continue
                    for review in line["Reviews"]:
                        rcount+= 1
                        if rcount % 100000 == 0:  print(rcount)
                        if rcount % 6 != 0: continue
                        if "Overall" not in review["Ratings"]: continue
                        texts.append(review["Content"])
                        labels.append((float(review["Ratings"]["Overall"]) - 3) *0.5)
                        if len(texts) % batchsize == 0:
                            if p != None:  p.join()
                            p= threading.Thread(target=self.fit_batch, args=(texts, labels, rcount))
                            p.start()
                            texts= []
                            labels= []
        if p != None:  p.join()
        self.fit_batch(texts, labels, rcount)

        if pickle_model!="":
            with gzip.open(pickle_model, 'wb') as model_file:
                pkl.dump((self.wb, self.clf), model_file, protocol=2)

    def predict(self, texts):
        vecs= self.wb.transform(texts)
        return self.clf.predict(vecs)
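A brief usage sketch for the WordvecRegressor above; the data directory, pickle path and review text are placeholders, and wordbatch/FTRL plus the GloVe files referenced in __init__ are assumed to be available.

reg = WordvecRegressor(pickle_model="wordvec_ftrl.pkl.gz",
                       datadir="../../../data/tripadvisor/json")   # placeholder paths
print(reg.predict(["Great location, friendly staff, spotless rooms."]))

# Later, datadir=None triggers the pickle-loading branch of __init__:
reg2 = WordvecRegressor(pickle_model="wordvec_ftrl.pkl.gz")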
Example no. 3
    def __init__(self, pickle_model="", datadir=None):
        self.wb = wordbatch.WordBatch(normalize_text,
                                      extractor=(WordBag, {
                                          "hash_ngrams": 3,
                                          "hash_ngrams_weights": [-1.0, -1.0, 1.0],
                                          "hash_size": 2**23,
                                          "norm": 'l2',
                                          "tf": 'binary',
                                          "idf": 50.0
                                      }))
        self.clf = FTRL(alpha=1.0,
                        beta=1.0,
                        L1=0.00001,
                        L2=1.0,
                        D=2**23,
                        iters=1,
                        inv_link="identity")
        if datadir == None:
            (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)
Example no. 4
class WordbagRegressor(object):
    def __init__(self, pickle_model="", datadir=None):
        self.wordbatch = wordbatch.WordBatch(normalize_text, extractors=[(wordbatch.WordBag, {"hash_ngrams":3,
          "hash_ngrams_weights":[-1.0, -1.0, 1.0],"hash_size":2**23, "norm":'l2', "tf":'binary', "idf":50.0})])
        self.clf= FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=1, inv_link="identity")
        if datadir==None:  (self.wordbatch, self.clf)= pkl.load(gzip.open(pickle_model, u'rb'))
        else: self.train(datadir, pickle_model)

    def fit_batch(self, texts, labels, rcount):
        texts, labels= self.wordbatch.shuffle_batch(texts, labels, rcount)
        print "Transforming", rcount
        texts= self.wordbatch.transform(texts)
        print "Training", rcount
        self.clf.fit(texts, labels)

    def train(self, datadir, pickle_model=""):
        texts= []
        labels= []
        training_data= os.listdir(datadir)
        rcount= 0
        batchsize= 100000

        p = None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, u'r') as inputfile:
                for line in inputfile:
                    #if rcount > 1000000: break
                    try: line = json.loads(line.strip())
                    except:  continue
                    for review in line["Reviews"]:
                        rcount+= 1
                        if rcount % 100000 == 0:  print(rcount)
                        if rcount % 7 != 0: continue
                        if "Overall" not in review["Ratings"]: continue
                        texts.append(review["Content"])
                        labels.append((float(review["Ratings"]["Overall"]) - 3) *0.5)
                        if len(texts) % batchsize == 0:
                            if p != None:  p.join()
                            p= threading.Thread(target=self.fit_batch, args=(texts, labels, rcount))
                            p.start()
                            texts= []
                            labels= []
        if p != None:  p.join()
        self.fit_batch(texts, labels, rcount)

        self.wordbatch.dictionary_freeze= True

        if pickle_model!="":
            with gzip.open(pickle_model, u'wb') as model_file:
                pkl.dump((self.wordbatch, self.clf), model_file, protocol=2)

    def predict(self, texts):
        counts= self.wordbatch.transform(texts)
        return self.clf.predict(counts)
Example no. 5
    def __init__(self, param_dict, feature_dim):
        alpha = param_dict['alpha']
        beta = param_dict['beta']
        L1 = param_dict['L1']
        L2 = param_dict['L2']
        iters = param_dict['iters']

        self.model = FTRL(alpha=alpha,
                          beta=beta,
                          L1=L1,
                          L2=L2,
                          D=feature_dim,
                          iters=iters,
                          inv_link="identity",
                          threads=6)
Example no. 6
    def __init__(self, spar_type, spar_penalty):
        # We create a separate model for each action in the environment's
        # action space. Alternatively we could somehow encode the action
        # into the features, but this way it's easier to code up.
        self.models = []
        for _ in range(env.action_space.n):
            #model=Lasso(alpha=0.01)
            model = SGDRegressor(learning_rate='constant',
                                 penalty=spar_type,
                                 l1_ratio=spar_penalty,
                                 max_iter=1000)
            model1 = PassiveAggressiveRegressor()
            model2 = Lasso(alpha=0.1, normalize=True, warm_start=True)
            model3 = FTRL(alpha=1.0,
                          beta=1.0,
                          L1=0.00001,
                          L2=1.0,
                          D=2**25,
                          iters=1)
            #l2,l1,none,elasticnet
            #,penalty='l1',l1_ratio=0)
            #learning_rate="constant"

            # We need to call partial_fit once to initialize the model
            # or we get a NotFittedError when trying to make a prediction
            # This is quite hacky.
            #model2.fit([self.featurize_state(env.reset())], [0])
            #X = np.array([self.featurize_state(env.reset())])
            #Y = np.array([0])
            #print X.shape, Y.shape
            #model.partial_fit(X,Y)

            model.partial_fit([self.featurize_state(env.reset())], [0])
            self.models.append(model)
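    # Hedged companion sketch (not in the original source): the per-action
    # models built above are typically queried like this. Assumes the usual
    # featurize_state helper on this class and `import numpy as np`.
    def predict(self, s, a=None):
        features = self.featurize_state(s)
        if a is None:
            # One value estimate per action in the environment's action space.
            return np.array([m.predict([features])[0] for m in self.models])
        return self.models[a].predict([features])[0]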
Example no. 7
    def __init__(self, pickle_model="", datadir=None):
        from pyspark import SparkContext
        self.sc= SparkContext()
        self.wordbatch = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams":3,
          "hash_ngrams_weights":[-1.0, -1.0, 1.0],"hash_size":2**23, "norm":'l2', "tf":'binary', "idf":50.0}))
        self.clf= FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=1, inv_link="identity")
        self.wordbatch.use_sc= True
        if datadir==None:  (self.wordbatch, self.clf)= pkl.load(gzip.open(pickle_model, 'rb'))
        else: self.train(datadir, pickle_model)
Example no. 8
class vanila_FTRL_Regressor:
    def __init__(self, param_dict, feature_dim):
        alpha = param_dict['alpha']
        beta = param_dict['beta']
        L1 = param_dict['L1']
        L2 = param_dict['L2']
        iters = param_dict['iters']

        self.model = FTRL(alpha=alpha,
                          beta=beta,
                          L1=L1,
                          L2=L2,
                          D=feature_dim,
                          iters=iters,
                          inv_link="identity",
                          threads=6)

    def fit(self, X_train, y_train):
        self.model.fit(X_train, y_train)

    def predict(self, X_valid):
        return self.model.predict(X_valid)
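A minimal usage sketch for the wrapper above, with toy sparse data and illustrative hyperparameter values (both are assumptions, not taken from the original project).

import numpy as np
from scipy.sparse import csr_matrix

params = {'alpha': 1.0, 'beta': 1.0, 'L1': 0.00001, 'L2': 1.0, 'iters': 3}
X_train = csr_matrix(np.random.rand(8, 16))   # 8 samples, 16 features
y_train = np.random.rand(8)

reg = vanila_FTRL_Regressor(params, feature_dim=X_train.shape[1])
reg.fit(X_train, y_train)
print(reg.predict(X_train[:2]))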
Example no. 9
    def __init__(self, pickle_model="", datadir=None, batcher=None):
        self.wb = WordBatch(normalize_text=normalize_text,
                            extractor=WordBag(
                                hash_ngrams=3,
                                hash_ngrams_weights=[-1.0, -1.0, 1.0],
                                hash_size=2**23,
                                norm='l2',
                                tf='binary',
                                idf=50.0),
                            batcher=batcher)

        self.clf = FTRL(alpha=1.0,
                        beta=1.0,
                        L1=0.00001,
                        L2=1.0,
                        D=2**23,
                        iters=1,
                        inv_link="identity")
        if datadir == None:
            (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)
Example no. 10
    def __init__(self, pickle_model="", datadir=None):
        self.wb= wordbatch.WordBatch(normalize_text,
                                            extractor=(Hstack,
                                   [(WordVec, {"wordvec_file": "../../../data/word2vec/glove.twitter.27B.100d.txt.gz",
                                    "normalize_text": normalize_text}),
                                   (WordVec, {"wordvec_file": "../../../data/word2vec/glove.6B.50d.txt.gz",
                                 "normalize_text": normalize_text})]))

        self.wb.dictionary_freeze= True

        self.clf= FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=1, inv_link= "identity")

        if datadir==None:  (self.wb, self.clf)= pkl.load(gzip.open(pickle_model, 'rb'))
        else: self.train(datadir, pickle_model)
Example no. 11
    def __init__(self, pickle_model="", datadir=None, batcher=None):
        self.wb = WordBatch(
            normalize_text,
            extractor=Hstack([
                WordVec(wordvec_file=
                        "../../../data/word2vec/glove.twitter.27B.100d.txt.gz",
                        normalize_text=normalize_text,
                        encoding="utf8"),
                WordVec(
                    wordvec_file="../../../data/word2vec/glove.6B.50d.txt.gz",
                    normalize_text=normalize_text,
                    encoding="utf8")
            ]))
        # from wordbatch.pipelines import FeatureUnion
        # from wordbatch.transformers import Dictionary, TextNormalizer
        # from sklearn.pipeline import Pipeline
        # tn= TextNormalizer(normalize_text=normalize_text)
        # dct= Dictionary()
        # vec1= WordVec(wordvec_file="../../../data/word2vec/glove.twitter.27B.100d.txt.gz",
        # 			  normalize_text=normalize_text, encoding="utf8", dictionary= dct)
        # vec2= WordVec(wordvec_file="../../../data/word2vec/glove.6B.50d.txt.gz",
        # 			  normalize_text=normalize_text, encoding="utf8", dictionary= dct)
        # self.wb = Pipeline(steps= [("tn", tn), ("dct", dct), ("vecs", FeatureUnion([("vec1", vec1), ("vec2", vec2)]))])
        self.batcher = batcher

        self.clf = FTRL(alpha=1.0,
                        beta=1.0,
                        L1=0.00001,
                        L2=1.0,
                        D=100 + 50,  # dimensionality of the two Hstacked GloVe vectors (100d + 50d)
                        iters=1,
                        inv_link="identity")

        if datadir == None:
            (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)
Example no. 12
def FTRL_train(train_X, train_y, isQuickRun):

    # Fewer FTRL passes for a quick run, a longer schedule otherwise.
    iters = 9 if isQuickRun else 47
    model = FTRL(alpha=0.01,
                 beta=0.1,
                 L1=0.00001,
                 L2=1.0,
                 D=train_X.shape[1],
                 iters=iters,
                 inv_link="identity",
                 threads=4)

    model.fit(train_X, train_y)

    return model
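A short, assumed usage sketch for FTRL_train; the random sparse matrix is purely illustrative.

import numpy as np
from scipy.sparse import csr_matrix

train_X = csr_matrix(np.random.rand(100, 20))
train_y = np.random.rand(100)

model = FTRL_train(train_X, train_y, isQuickRun=True)   # 9 FTRL passes
preds = model.predict(train_X)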
Example no. 13
    gc.collect()

    # Take the log of the target
    y = np.log1p(target)

    if enable_validation:
        truth_sr = np.log1p(truth)

    del train, target
    gc.collect()

    FTRL_model = FTRL(alpha=0.01,
                      beta=0.1,
                      L1=0.00001,
                      L2=1.0,
                      D=X.shape[1],
                      iters=50,
                      inv_link="identity",
                      threads=1)
    FTRL_model.fit(X, y)
    print("[{}] Train FTRL completed".format(time.time() - start_time))

    FM_FTRL_model = FM_FTRL(alpha=0.01,
                            beta=0.01,
                            L1=0.00001,
                            L2=0.1,
                            D=X.shape[1],
                            alpha_fm=0.01,
                            L2_fm=0.0,
                            init_fm=0.01,
                            D_fm=200,
Example no. 14
    aver_rmse = 0.0
    for train_index, val_index in kf.split(y):
        fold_id += 1
        print("Fold {} start...".format(fold_id))
        train_X, valid_X = X[train_index], X[val_index]
        train_y, valid_y = y[train_index], y[val_index]

        # train_X, valid_X, train_y, valid_y = train_test_split(X_train, y, test_size=0.2, random_state=42)
        # del X_train, y

        d_shape = train_X.shape[1]
        print('d_shape', d_shape)
        model = FTRL(alpha=0.01,
                     beta=0.1,
                     L1=0.1,
                     L2=10,
                     D=d_shape,
                     iters=5,
                     inv_link="identity",
                     threads=8)
        model.fit(train_X, train_y)

        def rmse(predictions, targets):
            print("calculating RMSE ...")
            return np.sqrt(((predictions - targets)**2).mean())

        preds_valid_ftrl = model.predict(X=valid_X)
        # print(" FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds_valid_ftrl)))
        print(" FTRL dev RMSLE:", rmse(valid_y, preds_valid_ftrl))
        # # model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=X_train.shape[1], alpha_fm=0.01, L2_fm=0.0,
        # #                 init_fm=0.01,
        # #                 D_fm=200, e_noise=0.0001, iters=15, inv_link="identity", threads=4)
Example no. 15
#from sklearn.feature_extraction.text import HashingVectorizer
#from sklearn.linear_model import *
#vct= HashingVectorizer()
#clf= SGDRegressor()

import wordbatch
from wordbatch.models import FTRL
from wordbatch.extractors import WordBag
wb= wordbatch.WordBatch(extractor=(WordBag, {"hash_ngrams":2, "hash_ngrams_weights":[0.5, -1.0], "hash_size":2**23, "norm":'l2', "tf":'log', "idf":50.0}))
clf= FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=1)

train_texts= ["Cut down a tree with a herring? It can't be done.", "Don't say that word.", "How can we not say the word if you don't tell us what it is?"]
train_labels= [1, 0, 1]
test_texts= ["Wait! I said it! I said it! Ooh! I said it again!"]

values = wb.transform(train_texts)
clf.fit(values, train_labels)
preds= clf.predict(wb.transform(test_texts))
print("values={}".format(values))
print("values={}".format(len(values)))
print("texts={}".format(test_texts))
print("transformed={}".format(wb.transform(test_texts)))
print(preds)
Example no. 16
def get_pred_ftrl(submission):
    start_time = time.time()
    from time import gmtime, strftime
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    # if 1 == 1:
    train = pd.read_table(
        '../input/mercari-price-suggestion-challenge/train.tsv', engine='c')
    test = pd.read_table(
        '../input/mercari-price-suggestion-challenge/test.tsv', engine='c')

    #train = pd.read_table('../input/train.tsv', engine='c')
    #test = pd.read_table('../input/test.tsv', engine='c')

    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    train = train[train["price"] != 0]
    #Xtrain,Xvalid = train_test_split(train, test_size=0.01,random_state=1)
    nrow_train = train.shape[0]
    #nrow_valid = Xvalid.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge: pd.DataFrame = pd.concat([train, test])
    #submission: pd.DataFrame = test[['test_id']]

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() -
                                                      start_time))

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del (wb)
    X_name = X_name[:, np.array(
        np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del (wb)
    X_description = X_description[:, np.array(
        np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))
    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))

    #    pd.to_pickle((sparse_merge, y), "xy.pkl")
    # else:
    #    nrow_train, nrow_test= 1481661, 1482535
    #    sparse_merge, y = pd.read_pickle("xy.pkl")

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_train:]
    print(sparse_merge.shape)

    gc.collect()
    train_X, train_y = X, y
    #'''
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X,
                                                              y,
                                                              test_size=0.05,
                                                              random_state=100)

    model = FTRL(alpha=0.01,
                 beta=0.1,
                 L1=0.00001,
                 L2=1.0,
                 D=sparse_merge.shape[1],
                 iters=50,
                 inv_link="identity",
                 threads=1)

    model.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsF = model.predict(X_test)
    submission['price_FTRL'] = predsF
    #print(rmsle(np.expm1(predsF),y_valid))
    #'''
    print('[{}] Predict FTRL completed'.format(time.time() - start_time))
    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=17,
                    inv_link="identity",
                    threads=4)

    model.fit(train_X, train_y)
    print('[{}] Train ridge v2 completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
    submission['price_FM_FTRL'] = predsFM
Example no. 17
class WordvecRegressor(object):
    def __init__(self, pickle_model="", datadir=None):
        self.wb = wordbatch.WordBatch(
            normalize_text,
            extractor=(Hstack, [
                (WordVec, {
                    "wordvec_file":
                    "../../../data/word2vec/glove.twitter.27B.100d.txt.gz",
                    "normalize_text": normalize_text
                }),
                (WordVec, {
                    "wordvec_file":
                    "../../../data/word2vec/glove.6B.50d.txt.gz",
                    "normalize_text": normalize_text
                })
            ]))

        self.wb.dictionary_freeze = True

        self.clf = FTRL(alpha=1.0,
                        beta=1.0,
                        L1=0.00001,
                        L2=1.0,
                        D=2**25,
                        iters=1,
                        inv_link="identity")

        if datadir == None:
            (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)

    def fit_batch(self, texts, labels, rcount):
        texts, labels = self.wb.shuffle_batch(texts, labels, rcount)
        print("Transforming", rcount)
        texts = self.wb.transform(texts)
        print("Training", rcount)
        self.clf.fit(texts, labels)

    def train(self, datadir, pickle_model=""):
        texts = []
        labels = []
        training_data = os.listdir(datadir)
        rcount = 0
        batchsize = 100000

        p = None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    #if rcount > 1000000: break
                    try:
                        line = json.loads(line.strip())
                    except:
                        continue
                    for review in line["Reviews"]:
                        rcount += 1
                        if rcount % 100000 == 0: print(rcount)
                        if rcount % 6 != 0: continue
                        if "Overall" not in review["Ratings"]: continue
                        texts.append(review["Content"])
                        labels.append(
                            (float(review["Ratings"]["Overall"]) - 3) * 0.5)
                        if len(texts) % batchsize == 0:
                            if p != None: p.join()
                            p = threading.Thread(target=self.fit_batch,
                                                 args=(texts, labels, rcount))
                            p.start()
                            texts = []
                            labels = []
        if p != None: p.join()
        self.fit_batch(texts, labels, rcount)

        if pickle_model != "":
            with gzip.open(pickle_model, 'wb') as model_file:
                pkl.dump((self.wb, self.clf), model_file, protocol=2)

    def predict(self, texts):
        vecs = self.wb.transform(texts)
        return self.clf.predict(vecs)
Example no. 18
def main():
    start_time = time.time()
    from time import gmtime, strftime
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    # if 1 == 1:
    ###train = pd.read_table('../input/mercari-price-suggestion-challenge/train.tsv', engine='c')
    ###test = pd.read_table('../input/mercari-price-suggestion-challenge/test.tsv', engine='c')

    train = pd.read_table('../input/train.tsv', engine='c')
    test = pd.read_table('../input/test.tsv', engine='c')

    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    submission = test[['test_id']]

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() -
                                                      start_time))

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del (wb)
    X_name = X_name[:, np.array(
        np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del (wb)
    X_description = X_description[:, np.array(
        np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))
    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))

    #    pd.to_pickle((sparse_merge, y), "xy.pkl")
    # else:
    #    nrow_train, nrow_test= 1481661, 1482535
    #    sparse_merge, y = pd.read_pickle("xy.pkl")

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    gc.collect()
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X,
                                                              y,
                                                              test_size=0.05,
                                                              random_state=100)

    model = FTRL(alpha=0.01,
                 beta=0.1,
                 L1=0.00001,
                 L2=1.0,
                 D=sparse_merge.shape[1],
                 iters=50,
                 inv_link="identity",
                 threads=1)

    model.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsF = model.predict(X_test)
    print('[{}] Predict FTRL completed'.format(time.time() - start_time))

    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=15,
                    inv_link="identity",
                    threads=4)

    model.fit(train_X, train_y)
    print('[{}] Train ridge v2 completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

    params = {
        'learning_rate': 0.6,
        'application': 'regression',
        'max_depth': 4,
        'num_leaves': 31,
        'verbosity': -1,
        'metric': 'RMSE',
        'data_random_seed': 1,
        'bagging_fraction': 0.6,
        'bagging_freq': 5,
        'feature_fraction': 0.6,
        'nthread': 4,
        'min_data_in_leaf': 100,
        'max_bin': 31
    }

    # Remove features with document frequency <=100
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 100, 0, 1),
                    dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X,
                                                              y,
                                                              test_size=0.05,
                                                              random_state=100)

    d_train = lgb.Dataset(train_X, label=train_y)
    watchlist = [d_train]
    if develop:
        d_valid = lgb.Dataset(valid_X, label=valid_y)
        watchlist = [d_train, d_valid]

    model = lgb.train(params, train_set=d_train, num_boost_round=6000, valid_sets=watchlist, \
                      early_stopping_rounds=1000, verbose_eval=1000)

    if develop:
        preds = model.predict(valid_X)
        print("LGB dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsL = model.predict(X_test)

    print('[{}] Predict LGB completed.'.format(time.time() - start_time))

    preds = (predsF * 0.2 + predsL * 0.3 + predsFM * 0.5)

    submission['price'] = np.expm1(preds)
    submission.to_csv("submission_wordbatch_ftrl_fm_lgb.csv", index=False)
Example no. 19
def main():
    start_time = time.time()
    from time import gmtime, strftime
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    # if 1 == 1:
    train = pd.read_table('../input/mercari-price-suggestion-challenge/train.tsv', engine='c')
    test = pd.read_table('../input/mercari-price-suggestion-challenge/test.tsv', engine='c')

    #train = pd.read_table('../input/train.tsv', engine='c')
    #test = pd.read_table('../input/test.tsv', engine='c')

    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge: pd.DataFrame = pd.concat([train, dftt, test])
    submission: pd.DataFrame = test[['test_id']]

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))

    wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0],
                                                                  "hash_size": 2 ** 29, "norm": None, "tf": 'binary',
                                                                  "idf": None,
                                                                  }), procs=8)
    wb.dictionary_freeze= True
    X_name = wb.fit_transform(merge['name'])
    del(wb)
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0],
                                                                  "hash_size": 2 ** 28, "norm": "l2", "tf": 1.0,
                                                                  "idf": None})
                             , procs=8)
    wb.dictionary_freeze= True
    X_description = wb.fit_transform(merge['item_description'])
    del(wb)
    X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

    X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']],
                                          sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(time.time() - start_time))
    print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() - start_time))

    #    pd.to_pickle((sparse_merge, y), "xy.pkl")
    # else:
    #    nrow_train, nrow_test= 1481661, 1482535
    #    sparse_merge, y = pd.read_pickle("xy.pkl")

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    gc.collect()
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05, random_state=100)

    model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=sparse_merge.shape[1], iters=50, inv_link="identity", threads=1)

    model.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsF = model.predict(X_test)
    print('[{}] Predict FTRL completed'.format(time.time() - start_time))

    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0, init_fm=0.01,
                    D_fm=200, e_noise=0.0001, iters=15, inv_link="identity", threads=4)

    model.fit(train_X, train_y)
    print('[{}] Train ridge v2 completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

    params = {
        'learning_rate': 0.6,
        'application': 'regression',
        'max_depth': 4,
        'num_leaves': 31,
        'verbosity': -1,
        'metric': 'RMSE',
        'data_random_seed': 1,
        'bagging_fraction': 0.6,
        'bagging_freq': 5,
        'feature_fraction': 0.6,
        'nthread': 4,
        'min_data_in_leaf': 100,
        'max_bin': 31
    }

    # Remove features with document frequency <=100
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 100, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05, random_state=100)

    d_train = lgb.Dataset(train_X, label=train_y)
    watchlist = [d_train]
    if develop:
        d_valid = lgb.Dataset(valid_X, label=valid_y)
        watchlist = [d_train, d_valid]

    model = lgb.train(params, train_set=d_train, num_boost_round=6000, valid_sets=watchlist, \
                      early_stopping_rounds=1000, verbose_eval=1000)

    if develop:
        preds = model.predict(valid_X)
        print("LGB dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsL = model.predict(X_test)

    print('[{}] Predict LGB completed.'.format(time.time() - start_time))

    preds = (predsF * 0.2 + predsL * 0.3 + predsFM * 0.5)

    submission['price'] = np.expm1(preds)
    submission.to_csv("submission_wordbatch_ftrl_fm_lgb.csv", index=False)
Example no. 20
class WordbagRegressor(object):
    def __init__(self, pickle_model="", datadir=None, batcher=None):
        self.wb = WordBatch(normalize_text=normalize_text,
                            extractor=WordBag(
                                hash_ngrams=3,
                                hash_ngrams_weights=[-1.0, -1.0, 1.0],
                                hash_size=2**23,
                                norm='l2',
                                tf='binary',
                                idf=50.0),
                            batcher=batcher)

        self.clf = FTRL(alpha=1.0,
                        beta=1.0,
                        L1=0.00001,
                        L2=1.0,
                        D=2**23,
                        iters=1,
                        inv_link="identity")
        if datadir == None:
            (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)

    def fit_batch(self, texts, labels, rcount):
        texts, labels = self.wb.batcher.shuffle_batch(texts, labels, rcount)
        print("Transforming", rcount)
        texts = self.wb.fit_transform(texts, reset=False)
        print("Training", rcount)
        self.clf.fit(texts, labels, reset=False)

    def train(self, datadir, pickle_model=""):
        texts = []
        labels = []
        training_data = os.listdir(datadir)
        rcount = 0
        batchsize = 100000

        p = None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    #if rcount > 1000000: break
                    try:
                        line = json.loads(line.strip())
                    except:
                        continue
                    for review in line["Reviews"]:
                        rcount += 1
                        if rcount % 100000 == 0: print(rcount)
                        if rcount % 7 != 0: continue
                        if "Overall" not in review["Ratings"]: continue
                        texts.append(review["Content"])
                        labels.append(
                            (float(review["Ratings"]["Overall"]) - 3) * 0.5)
                        if len(texts) % batchsize == 0:
                            if p != None: p.join()
                            p = threading.Thread(target=self.fit_batch,
                                                 args=(texts, labels, rcount))
                            p.start()
                            texts = []
                            labels = []
        if p != None: p.join()
        self.fit_batch(texts, labels, rcount)

        self.wb.dictionary_freeze = True

        if pickle_model != "":
            with gzip.open(pickle_model, 'wb') as model_file:
                backend = self.wb.batcher.backend
                backend_handle = self.wb.batcher.backend_handle
                self.wb.batcher.backend = "serial"
                self.wb.batcher.backend_handle = None
                pkl.dump((self.wb, self.clf), model_file, protocol=2)
                self.wb.batcher.backend = backend
                self.wb.batcher.backend_handle = backend_handle

    def predict(self, texts):
        counts = self.wb.transform(texts)
        return self.clf.predict(counts)
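With the wordbatch 1.4+ pipeline API used here, the batcher argument is normally a Batcher instance; a hedged construction sketch (the module path, constructor arguments and data directory are assumptions and may differ between versions):

from wordbatch.batcher import Batcher   # assumed module path for wordbatch 1.4+

batcher = Batcher(procs=8)               # parallelism only; other options left at defaults
model = WordbagRegressor(datadir="../../../data/tripadvisor/json", batcher=batcher)
preds = model.predict(["Great location, friendly staff."])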
Example no. 21
class WordbagRegressor(object):
	def __init__(self, pickle_model="", datadir=None):
		from pyspark import SparkContext
		self.sc= SparkContext()
		self.wordbatch = wordbatch.WordBatch(normalize_text, backend="spark", backend_handle=self.sc,
		                                     extractor=(WordBag, {"hash_ngrams":3,
		                                                          "hash_ngrams_weights":[-1.0, -1.0, 1.0],
		                                                          "hash_size":2**23, "norm":'l2',
		                                                          "tf":'binary', "idf":50.0}))
		self.clf= FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 23, iters=1, inv_link="identity")
		if datadir==None:  (self.wordbatch, self.clf)= pkl.load(gzip.open(pickle_model, 'rb'))
		else: self.train(datadir, pickle_model)

	def fit_batch(self, texts, labels, rcount):
		print("Transforming", rcount)
		# if self.sc != None:
		# 	data_rdd= self.wordbatch.lists2rddbatches([texts, labels], self.sc)
		# 	data_rdd= self.wordbatch.transform(data_rdd)
		# 	[texts, labels]= self.wordbatch.rddbatches2lists(data_rdd)
		# else:
		# print(texts[:2])
		# print(pd.Series(labels).value_counts())
		texts= self.wordbatch.partial_fit_transform(texts)
		print("Training", rcount)
		self.clf.partial_fit(texts, labels)

	def train(self, datadir, pickle_model=""):
		texts= []
		labels= []
		training_data= os.listdir(datadir)
		rcount= 0
		batchsize= 20000

		p = None
		for jsonfile in training_data:
			with open(datadir + "/" + jsonfile, 'r') as inputfile:
				for line in inputfile:
					#if rcount > 1000000: break
					try: line = json.loads(line.strip())
					except:  continue
					for review in line["Reviews"]:
						rcount+= 1
						if rcount % 100000 == 0:  print(rcount)
						if rcount % 7 != 0: continue
						if "Overall" not in review["Ratings"]: continue
						texts.append(review["Content"])
						labels.append((float(review["Ratings"]["Overall"]) - 3) *0.5)
						if len(texts) % batchsize == 0:
							if p != None:  p.join()
							p= threading.Thread(target=self.fit_batch, args=(texts, labels, rcount))
							p.start()
							texts= []
							labels= []
		if p != None:  p.join()
		self.fit_batch(texts, labels, rcount)

		self.wordbatch.dictionary_freeze= True

		if pickle_model!="":
			with gzip.open(pickle_model, 'wb') as model_file:
				pkl.dump((self.wordbatch, self.clf), model_file, protocol=2)

	def predict(self, texts):
		# if self.sc != None:
		# 	data_rdd= self.wordbatch.lists2rddbatches([texts, []], self.sc)
		# 	data_rdd= self.wordbatch.transform(data_rdd)
		# 	[counts, labels]= self.wordbatch.rddbatches2lists(data_rdd)
		# else:
		counts= self.wordbatch.transform(texts)
		return self.clf.predict(counts)

	def predict_parallel(self, texts):
		# if self.sc != None:
		# 	data_rdd= self.wordbatch.lists2rddbatches([texts, []] , self.sc)
		# 	counts_rdd= self.wordbatch.transform(data_rdd)
		# 	return self.wordbatch.rddbatches2lists(self.wordbatch.predict_parallel(counts_rdd, self.clf))[0]
		counts= self.wordbatch.transform(texts)
		return self.wordbatch.predict_parallel(counts, self.clf)
Example no. 22
def main(test, logger):

    logger.info('Start . . .')
    train = pd.read_table('../input/train.tsv', engine='c')
    logger.info('Load train')
    logger.info('train shape {}'.format(train.shape))
    logger.info('test shape {}'.format(test.shape))
    nrow_train = train.shape[0]
    y = np.log1p(train['price'])

    train_low_price = train.loc[train['price'] < 1.]
    train = train.drop(train[train['price'] < 1.].index)
    del train_low_price['price']
    logger.info('train_low_price shape {}'.format(train_low_price.shape))

    df_full = pd.concat([train, train_low_price, test])
    logger.info('df_full shape {}'.format(df_full.shape))

    sub = test[['test_id']]
    logger.info('sub shape {}'.format(sub.shape))

    del train, test
    gc.collect()

    df_full['general_cat'], df_full['subcat_1'], df_full['subcat_2'] = zip(
        *df_full['category_name'].apply(lambda x: split_category(x)))
    df_full.drop(['category_name'], axis=1, inplace=True)
    logger.info('Split category_name')
    gc.collect()

    df_full = impute_missing_value(df_full)
    logger.info('Impute missing value')
    gc.collect()

    df_full = cut_df(df_full)
    logger.info('Cut categories')
    gc.collect()

    df_full = to_categorical(df_full)
    logger.info('Convert to categorical features')
    gc.collect()

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(df_full['name'])
    del wb
    gc.collect()
    X_name = X_name[:, np.array(
        np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    logger.info('Vectorize name')
    gc.collect()

    cnt_vec = CountVectorizer()
    X_cat_1 = cnt_vec.fit_transform(df_full['general_cat'])
    X_cat_2 = cnt_vec.fit_transform(df_full['subcat_1'])
    X_cat_3 = cnt_vec.fit_transform(df_full['subcat_2'])
    df_full.drop(['general_cat', 'subcat_1', 'subcat_2'], axis=1, inplace=True)
    del cnt_vec
    gc.collect()
    logger.info('Vectorize category (general_cat, subcat_1, subcat_2)')

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**29,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=2)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(df_full['item_description'])
    del wb
    gc.collect()
    X_description = X_description[:, np.array(
        np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    logger.info('Vectorize item_description')
    gc.collect()

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(df_full['brand_name'])
    df_full.drop(['brand_name'], axis=1, inplace=True)
    del lb
    gc.collect()
    logger.info('Label binarize brand_name')

    X_dummies = csr_matrix(
        pd.get_dummies(df_full[['item_condition_id', 'shipping']],
                       sparse=True).values)
    df_full.drop(['item_condition_id', 'shipping'], axis=1, inplace=True)
    logger.info('Get dummies on item_condition_id and shipping')
    gc.collect()

    sparse_merge = hstack((X_dummies, X_description, X_brand, X_cat_1, X_cat_2,
                           X_cat_3, X_name)).tocsr()
    logger.info('Create sparse features')
    logger.info('sparse_merge shape {}'.format(sparse_merge.shape))
    del X_dummies, X_description, X_brand, X_cat_1, X_cat_2, X_cat_3, X_name
    gc.collect()

    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    logger.info('Remove features with doc frequency <= 1')
    logger.info('sparse_merge shape {}'.format(sparse_merge.shape))

    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_train:]
    sparse_merge_shape = sparse_merge.shape
    del sparse_merge
    gc.collect()

    model = FTRL(alpha=0.01,
                 beta=0.1,
                 L1=0.00001,
                 L2=1.0,
                 D=sparse_merge_shape[1],
                 iters=30,
                 inv_link="identity",
                 threads=1)
    model.fit(X, y)
    logger.info('Fit FTRL')
    preds_FTRL = model.predict(X_test)
    logger.info('Predict FTRL')

    model = FM_FTRL(alpha=0.01,
                    beta=0.1,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge_shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=20,
                    inv_link="identity",
                    threads=4)
    model.fit(X, y)
    logger.info('Fit FM_FTRL')
    preds_FM_FTRL = model.predict(X_test)
    logger.info('Predict FM_FTRL')

    preds = (np.expm1(preds_FTRL) * 0.15 + np.expm1(preds_FM_FTRL) * 0.85)
    logger.info('Final predictions generated')
    return preds
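
The column slicing with np.clip(X.getnnz(axis=0) - 1, 0, 1) used above (and repeated in the later examples) is a document-frequency filter: it keeps only the hashed features that occur in at least two rows. A minimal, self-contained sketch of the idiom, using only numpy and scipy (the toy matrix is an illustration, not data from the kernel):

import numpy as np
from scipy.sparse import csr_matrix

X = csr_matrix(np.array([[1, 0, 2, 0],
                         [0, 0, 3, 0],
                         [4, 0, 5, 1]]))
doc_freq = X.getnnz(axis=0)                                # non-zeros per column: [2, 0, 3, 1]
mask = np.array(np.clip(doc_freq - 1, 0, 1), dtype=bool)   # True only where doc_freq >= 2
print(X[:, mask].shape)                                    # (3, 2): the empty and singleton columns are dropped
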
Esempio n. 23
0
def main():
    start_time = time.time()
    from time import gmtime, strftime
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    # if 1 == 1:
    train = pd.read_table(
        '../input/mercari-price-suggestion-challenge/train.tsv', engine='c')
    test = pd.read_table(
        '../input/mercari-price-suggestion-challenge/test.tsv', engine='c')

    #train = pd.read_table('../input/train.tsv', engine='c')
    #test = pd.read_table('../input/test.tsv', engine='c')

    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge: pd.DataFrame = pd.concat([train, dftt, test])
    submission: pd.DataFrame = test[['test_id']]

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() -
                                                      start_time))

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del wb
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1),
                                dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del wb
    X_description = X_description[:, np.array(
        np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))
    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))
    del X_dummies, merge, X_description, lb, X_brand, X_category1, X_category2, X_category3, X_name
    gc.collect()

    #    pd.to_pickle((sparse_merge, y), "xy.pkl")
    # else:
    #    nrow_train, nrow_test= 1481661, 1482535
    #    sparse_merge, y = pd.read_pickle("xy.pkl")

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X,
                                                              y,
                                                              test_size=0.05,
                                                              random_state=100)

    model = FTRL(alpha=0.01,
                 beta=0.1,
                 L1=0.00001,
                 L2=1.0,
                 D=sparse_merge.shape[1],
                 iters=30,
                 inv_link="identity",
                 threads=1)
    del X
    gc.collect()
    model.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsF = model.predict(X_test)
    print('[{}] Predict FTRL completed'.format(time.time() - start_time))

    model = FM_FTRL(alpha=0.012,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=17,
                    inv_link="identity",
                    threads=4)

    model.fit(train_X, train_y)
    del train_X, train_y
    gc.collect()
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
    del X_test
    gc.collect()
    params = {
        'learning_rate': 0.65,
        'application': 'regression',
        'max_depth': 4,
        'num_leaves': 42,
        'verbosity': -1,
        'metric': 'RMSE',
        'data_random_seed': 1,
        'bagging_fraction': 0.71,
        'bagging_freq': 5,
        'feature_fraction': 0.67,
        'nthread': 4,
        'min_data_in_leaf': 120,
        'max_bin': 40
    }

    # Remove features with document frequency <=100
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 100, 0, 1),
                    dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)
    del sparse_merge
    gc.collect()
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X,
                                                              y,
                                                              test_size=0.05,
                                                              random_state=100)
    del X, y
    gc.collect()
    d_train = lgb.Dataset(train_X, label=train_y)
    # del train_X, train_y; gc.collect()
    watchlist = [d_train]
    if develop:
        d_valid = lgb.Dataset(valid_X, label=valid_y)
        del valid_y
        gc.collect()
        watchlist = [d_train, d_valid]

    #model = lgb.train(params, train_set=d_train, num_boost_round=7500, valid_sets=watchlist, \
    #                  early_stopping_rounds=1000, verbose_eval=1000)

    model = lgb.train(params, train_set=d_train, num_boost_round=3000, valid_sets=watchlist, \
                      early_stopping_rounds=1000, verbose_eval=1000)

    del d_train
    gc.collect()
    if develop:
        preds = model.predict(valid_X)
        del valid_X
        gc.collect()
        print("LGB dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsL = model.predict(X_test)
    # del X_test; gc.collect()
    print('[{}] Predict LGB completed.'.format(time.time() - start_time))

    #--- BEGIN Huber
    # Details: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.HuberRegressor.html

    # class sklearn.linear_model.HuberRegressor(epsilon=1.35,
    #      max_iter=100, alpha=0.0001, warm_start=False, fit_intercept=True,
    #      tol=1e-05)[source]

    setup_Huber = 2

    if (setup_Huber == 1):
        model = HuberRegressor(fit_intercept=True,
                               alpha=0.01,
                               max_iter=80,
                               epsilon=363)

    if (setup_Huber == 2):
        model = HuberRegressor(fit_intercept=True,
                               alpha=0.05,
                               max_iter=200,
                               epsilon=1.2)

    model.fit(train_X, train_y)
    print('[{}] Predict Huber completed.'.format(time.time() - start_time))
    predsH = model.predict(X=X_test)
    #--- END Huber

    # original
    # preds = (predsF * 0.2 + predsL * 0.3 + predsFM * 0.5)

    # modified setup (IT NEEDS MORE TUNING TESTS)
    w = (0.09, 0.11, 0.23, 0.57)

    preds = predsH * w[0] + predsF * w[1] + predsL * w[2] + predsFM * w[3]

    submission['price'] = np.expm1(preds)
    submission.to_csv("sub ftrl_fm_lgb_huber v3.csv", index=False)

    nm = (time.time() - start_time) / 60
    print("Total processing time %s min" % nm)
Esempio n. 24
0
#    sparse_merge, y = pd.read_pickle("xy.pkl")

# Remove features with document frequency <=1
print(sparse_merge.shape)
mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
sparse_merge = sparse_merge[:, mask]
X = sparse_merge[:nrow_train]
X_test = sparse_merge[nrow_test:]
print(sparse_merge.shape)
d_shape = sparse_merge.shape[1]
gc.collect()
train_X, train_y = X, y
if develop:
    train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05, random_state=100)

model = FTRL(alpha=0.1680, beta=9.7895, L1=0.0011, L2=8.9635, D=d_shape, iters=int(11), inv_link="identity", threads=4)
###

del lb
del mask
del X_name
del X_category1
del X_category2
del X_category3
del X
del y
del merge
del X_dummies
del X_brand
del dftt
del X_description
Esempio n. 25
0
class WordvecRegressor(object):
    def __init__(self, pickle_model="", datadir=None, batcher=None):
        self.wb = WordBatch(
            normalize_text,
            extractor=Hstack([
                WordVec(wordvec_file=
                        "../../../data/word2vec/glove.twitter.27B.100d.txt.gz",
                        normalize_text=normalize_text,
                        encoding="utf8"),
                WordVec(
                    wordvec_file="../../../data/word2vec/glove.6B.50d.txt.gz",
                    normalize_text=normalize_text,
                    encoding="utf8")
            ]))
        # from wordbatch.pipelines import FeatureUnion
        # from wordbatch.transformers import Dictionary, TextNormalizer
        # from sklearn.pipeline import Pipeline
        # tn= TextNormalizer(normalize_text=normalize_text)
        # dct= Dictionary()
        # vec1= WordVec(wordvec_file="../../../data/word2vec/glove.twitter.27B.100d.txt.gz",
        # 			  normalize_text=normalize_text, encoding="utf8", dictionary= dct)
        # vec2= WordVec(wordvec_file="../../../data/word2vec/glove.6B.50d.txt.gz",
        # 			  normalize_text=normalize_text, encoding="utf8", dictionary= dct)
        # self.wb = Pipeline(steps= [("tn", tn), ("dct", dct), ("vecs", FeatureUnion([("vec1", vec1), ("vec2", vec2)]))])
        self.batcher = batcher

        self.clf = FTRL(alpha=1.0,
                        beta=1.0,
                        L1=0.00001,
                        L2=1.0,
                        D=100 + 50,
                        iters=1,
                        inv_link="identity")

        if datadir == None:
            (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)

    def fit_batch(self, texts, labels, rcount):
        texts, labels = shuffle(texts, labels)
        print("Transforming", rcount)
        #texts= self.wb.fit_transform(texts, tn__batcher=self.batcher, dct__reset= False, dct__batcher= self.batcher)
        texts = self.wb.fit_transform(texts)
        print("Training", rcount)
        self.clf.fit(texts, labels, reset=False)

    def train(self, datadir, pickle_model=""):
        texts = []
        labels = []
        training_data = os.listdir(datadir)
        rcount = 0
        batchsize = 80000

        p = None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    #if rcount > 1000000: break
                    try:
                        line = json.loads(line.strip())
                    except:
                        continue
                    for review in line["Reviews"]:
                        rcount += 1
                        if rcount % 100000 == 0: print(rcount)
                        if rcount % 6 != 0: continue
                        if "Overall" not in review["Ratings"]: continue
                        texts.append(review["Content"])
                        labels.append(
                            (float(review["Ratings"]["Overall"]) - 3) * 0.5)
                        if len(texts) % batchsize == 0:
                            if p != None: p.join()
                            p = threading.Thread(target=self.fit_batch,
                                                 args=(texts, labels, rcount))
                            p.start()
                            texts = []
                            labels = []
        if p != None: p.join()
        self.fit_batch(texts, labels, rcount)

        # if pickle_model!="":
        # 	with gzip.open(pickle_model, 'wb') as model_file:
        # 		backend = self.wb.batcher.backend
        # 		backend_handle = self.wb.batcher.backend_handle
        # 		self.wb.batcher.backend = "serial"
        # 		self.wb.batcher.backend_handle = None
        # 		pkl.dump((self.wb, self.clf), model_file, protocol=2)
        # 		self.wb.batcher.backend = backend
        # 		self.wb.batcher.backend_handle = backend_handle

    def predict(self, texts):
        vecs = self.wb.transform(texts)
        return self.clf.predict(vecs)
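
A hypothetical way to exercise the regressor above; the review directory, pickle path, and sample sentence are assumptions, not part of the original example:

if __name__ == "__main__":
    # Hypothetical paths: point datadir at a folder of TripAdvisor-style JSON review files.
    wv = WordvecRegressor(datadir="../../../data/json",
                          pickle_model="wordvec_ftrl.pkl.gz")
    print(wv.predict(["Great location and very friendly staff."]))
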
Esempio n. 26
0
def wordbatch_algo(test):
    import time

    print(time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime()))

    # if 1 == 1:
    # train = pd.read_table('../input/mercari-price-suggestion-challenge/train.tsv', engine='c')
    # test = pd.read_table('../input/mercari-price-suggestion-challenge/test.tsv', engine='c')

    train = pd.read_table('../input/train.tsv', engine='c')
    # test = pd.read_table('../input/test.tsv', engine='c')

    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge: pd.DataFrame = pd.concat([train, dftt, test])
    # submission: pd.DataFrame = test[['test_id']]
    '''
    # Mean of each group # https://stackoverflow.com/questions/30244952/python-pandas-create-new-column-with-groupby-sum
    cat_mean = train['price'].groupby(train['category_name']).mean()
    cat_mean = pd.DataFrame({'category_name':cat_mean.index, 'cat_mean':cat_mean.values})
    merge = merge.merge(cat_mean, on=['category_name'], how='left')
    # print(merge.head())
    X_cat_mean = merge['cat_mean'].as_matrix().reshape(-1, 1)
    # X_cat_mean = normalize(np.nan_to_num(X_cat_mean).reshape(-1, 1), norm='max')  
    
    
    cond_mean = train['price'].groupby(train['item_condition_id']).mean()
    cond_mean = pd.DataFrame({'item_condition_id':cond_mean.index, 'cond_mean':cond_mean.values})
    merge = merge.merge(cond_mean, on=['item_condition_id'], how='left')
    X_cond_mean = merge['cond_mean'].as_matrix().reshape(-1, 1)
    

    brand_mean = train['price'].groupby(train['brand_name']).mean()
    brand_mean = pd.DataFrame({'brand_name':brand_mean.index, 'brand_mean':brand_mean.values})
    merge = merge.merge(brand_mean, on=['brand_name'], how='left')
    X_brand_mean = merge['brand_mean'].as_matrix().reshape(-1, 1)
    

    ship_mean = train['price'].groupby(train['shipping']).mean()
    ship_mean = pd.DataFrame({'shipping':ship_mean.index, 'ship_mean':ship_mean.values})
    merge = merge.merge(ship_mean, on=['shipping'], how='left')
    X_ship_mean = merge['ship_mean'].as_matrix().reshape(-1, 1)
    '''

    del train
    del test
    gc.collect()



    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() -
                                                      start_time))

    # Add some new features:
    X_len_desc = merge['item_description'].apply(
        lambda x: len(x)).as_matrix().reshape(-1, 1)
    X_len_name = merge['name'].apply(lambda x: len(x)).as_matrix().reshape(
        -1, 1)

    # X_len_description = normalize(np.nan_to_num(X_len_description).reshape(-1, 1), norm='max')
    # X_len_name = normalize(np.nan_to_num(X_len_name).reshape(-1, 1), norm='max')

    print('[{}] Length `item_description` completed.'.format(time.time() -
                                                             start_time))
    print('[{}] Length `name` completed.'.format(time.time() - start_time))

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del wb
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1),
                                dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del wb
    X_description = X_description[:, np.array(
        np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))
    print(
        X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape,
        X_category2.shape, X_category3.shape, X_name.shape
    )  #, X_glove.shape, X_len_description.shape, X_len_name.shape, X_cat_mean.shape)
    # sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name)).tocsr()
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))
    del X_dummies, merge, X_description, lb, X_brand, X_category1, X_category2, X_category3, X_name
    gc.collect()

    #    pd.to_pickle((sparse_merge, y), "xy.pkl")
    # else:
    #    nrow_train, nrow_test= 1481661, 1482535
    #    sparse_merge, y = pd.read_pickle("xy.pkl")

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(
            X, y, test_size=TEST_SIZE, random_state=SPLIT_SEED)

    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=FM_iter,
                    inv_link="identity",
                    threads=4)

    model.fit(train_X, train_y)
    gc.collect()
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
    gc.collect()
    print(predsFM)

    #model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=sparse_merge.shape[1], iters=50, inv_link="identity", threads=1)
    model = FTRL(alpha=0.01,
                 beta=0.1,
                 L1=0.00001,
                 L2=1.0,
                 D=sparse_merge.shape[1],
                 iters=FTRL_iter,
                 inv_link="identity",
                 threads=1)
    del X
    gc.collect()
    model.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsF = model.predict(X_test)
    print('[{}] Predict FTRL completed'.format(time.time() - start_time))
    print(predsF)

    del train_X, train_y
    del X_test

    return predsFM, predsF
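
split_cat, which the kernels above use to break category_name into general_cat, subcat_1 and subcat_2, is defined earlier in these scripts; the usual Mercari version is roughly the following sketch (categories in this dataset have three levels or are missing):

def split_cat(text):
    # "Men/Tops/T-shirts" -> ["Men", "Tops", "T-shirts"];
    # a missing (NaN) category falls back to "No Label" at every level.
    try:
        return text.split("/")
    except AttributeError:
        return ("No Label", "No Label", "No Label")
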
Esempio n. 27
0
del wb
X_description = X_description[:, np.where(X_description.getnnz(axis=0) > 1)[0]]
lb = LabelBinarizer(sparse_output=True)
X_brand = lb.fit_transform(merge['Source'])
X_dummies = csr_matrix(pd.get_dummies(merge[['IDLink', 'Facebook']],
                                      sparse=True).values)
sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_name)).tocsr()
sparse_merge = sparse_merge[:, np.where(sparse_merge.getnnz(axis=0) > 100)[0]]
X = sparse_merge[:nrow_train]
X_test = sparse_merge[nrow_test:]
print(sparse_merge.shape)
gc.collect()
train_X, train_y = X, y
if develop:
    train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05, random_state=100)
model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=sparse_merge.shape[1], iters=50, inv_link="identity", threads=1)
model.fit(train_X, train_y)
print('[{}] Train FTRL completed'.format(time.time() - start_time))
if develop:
    preds = model.predict(X=valid_X)
    print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))
predsF = model.predict(X_test)
print('[{}] Predict FTRL completed'.format(time.time() - start_time))
model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0, init_fm=0.01,
                D_fm=200, e_noise=0.0001, iters=17, inv_link="identity", threads=4)
model.fit(train_X, train_y)
print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
if develop:
    preds = model.predict(X=valid_X)
    print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))
predsFM = model.predict(X_test)
Esempio n. 28
0
                  use_avx=1,
                  verbose=0)
    # iters changed to 2 from 116_7
    # threads=4, use_avx=1, verbose=0)
elif wordbatch_model == 'NN_ReLU_H1':
    clf = NN_ReLU_H1(alpha=0.05,
                     D=D,
                     verbose=9,
                     e_noise=0.0,
                     threads=4,
                     inv_link="sigmoid")
elif wordbatch_model == 'FTRL':
    clf = FTRL(alpha=0.05,
               beta=0.1,
               L1=0.0,
               L2=0.0,
               D=D,
               iters=3,
               threads=4,
               verbose=9)

dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8',
}

p = None
rcount = 0
class WordbagRegressor(object):
    def __init__(self, pickle_model="", datadir=None):
        from pyspark import SparkContext
        self.sc= SparkContext()
        self.wordbatch = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams":3,
          "hash_ngrams_weights":[-1.0, -1.0, 1.0],"hash_size":2**23, "norm":'l2', "tf":'binary', "idf":50.0}))
        self.clf= FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=1, inv_link="identity")
        self.wordbatch.use_sc= True
        if datadir==None:  (self.wordbatch, self.clf)= pkl.load(gzip.open(pickle_model, 'rb'))
        else: self.train(datadir, pickle_model)

    def fit_batch(self, texts, labels, rcount):
        texts, labels= self.wordbatch.shuffle_batch(texts, labels, rcount)
        print("Transforming", rcount)
        if self.sc != None:
            data_rdd= self.wordbatch.lists2rddbatches([texts, labels], self.sc)
            data_rdd= self.wordbatch.transform(data_rdd)
            [texts, labels]= self.wordbatch.rddbatches2lists(data_rdd)
        else:
            texts= self.wordbatch.transform(texts)
        print("Training", rcount)
        self.clf.fit(texts, labels)

    def train(self, datadir, pickle_model=""):
        texts= []
        labels= []
        training_data= os.listdir(datadir)
        rcount= 0
        batchsize= 20000

        p = None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    #if rcount > 1000000: break
                    try: line = json.loads(line.strip())
                    except:  continue
                    for review in line["Reviews"]:
                        rcount+= 1
                        if rcount % 100000 == 0:  print(rcount)
                        if rcount % 7 != 0: continue
                        if "Overall" not in review["Ratings"]: continue
                        texts.append(review["Content"])
                        labels.append((float(review["Ratings"]["Overall"]) - 3) *0.5)
                        if len(texts) % batchsize == 0:
                            if p != None:  p.join()
                            p= threading.Thread(target=self.fit_batch, args=(texts, labels, rcount))
                            p.start()
                            texts= []
                            labels= []
        if p != None:  p.join()
        self.fit_batch(texts, labels, rcount)

        self.wordbatch.dictionary_freeze= True

        if pickle_model!="":
            with gzip.open(pickle_model, 'wb') as model_file:
                pkl.dump((self.wordbatch, self.clf), model_file, protocol=2)

    def predict(self, texts):
        if self.sc != None:
            data_rdd= self.wordbatch.lists2rddbatches([texts, []], self.sc)
            data_rdd= self.wordbatch.transform(data_rdd)
            [counts, labels]= self.wordbatch.rddbatches2lists(data_rdd)
        else: counts= self.wordbatch.transform(texts)
        return self.clf.predict(counts)

    def predict_parallel(self, texts):
        if self.sc != None:
            data_rdd= self.wordbatch.lists2rddbatches([texts, []], self.sc)
            counts_rdd= self.wordbatch.transform(data_rdd)
            return self.wordbatch.rddbatches2lists(self.wordbatch.predict_parallel(counts_rdd, self.clf))[0]
        counts= self.wordbatch.transform(texts)
        return self.wordbatch.predict_parallel(counts, self.clf)
Esempio n. 30
0
def wordbatch_algo():
    import time

    # print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    train = pd.read_table('../input/train.tsv', engine='c')
    # Drop rows where price = 0
    train = train[train.price != 0].reset_index(drop=True)
    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)

    y = np.log1p(train["price"])

    nrow_train = train.shape[0]

    # Training
    train['general_cat'], train['subcat_1'], train['subcat_2'] = \
        zip(*train['category_name'].apply(lambda x: split_cat(x)))
    train.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(train)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(train)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(train)
    print('[{}] Convert categorical completed'.format(time.time() -
                                                      start_time))

    # Add some new features:
    X_len_desc = train['item_description'].apply(
        lambda x: len(x)).as_matrix().reshape(-1, 1)
    X_len_name = train['name'].apply(lambda x: len(x)).as_matrix().reshape(
        -1, 1)

    print('[{}] Length of text completed.'.format(time.time() - start_time))

    # Name
    wb_name = wordbatch.WordBatch(normalize_text,
                                  extractor=(WordBag, {
                                      "hash_ngrams": 2,
                                      "hash_ngrams_weights": [1.5, 1.0],
                                      "hash_size": 2**29,
                                      "norm": None,
                                      "tf": 'binary',
                                      "idf": None,
                                  }),
                                  procs=8)

    wb_name.dictionary_freeze = True
    wb_name.fit(train['name'])
    X_name = wb_name.transform(train['name'])

    # X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb_cat1 = CountVectorizer()
    wb_cat2 = CountVectorizer()
    wb_cat3 = CountVectorizer()
    wb_cat1.fit(train['general_cat'])
    wb_cat2.fit(train['subcat_1'])
    wb_cat3.fit(train['subcat_2'])

    X_category1 = wb_cat1.transform(train['general_cat'])
    X_category2 = wb_cat2.transform(train['subcat_1'])
    X_category3 = wb_cat3.transform(train['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb_desc = wordbatch.WordBatch(normalize_text,
                                  extractor=(WordBag, {
                                      "hash_ngrams": 2,
                                      "hash_ngrams_weights": [1.0, 1.0],
                                      "hash_size": 2**28,
                                      "norm": "l2",
                                      "tf": 1.0,
                                      "idf": None
                                  }),
                                  procs=8)
    wb_desc.dictionary_freeze = True
    wb_desc.fit(train['item_description'])
    X_description = wb_desc.transform(train['item_description'])

    # X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    lb = LabelBinarizer(sparse_output=True)
    lb.fit(train['brand_name'])
    X_brand = lb.transform(train['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_cond, d_cond = fit_dummy(train['item_condition_id'].tolist())
    X_ship, d_ship = fit_dummy(train['shipping'].tolist())

    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))

    del train
    gc.collect()

    print(X_cond.shape, X_ship.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape)
    sparse_merge = hstack((X_cond, X_ship, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))
    del X_description, X_brand, X_category1, X_category2, X_category3, X_name
    gc.collect()

    # Remove features with document frequency <=1

    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    print(sparse_merge.shape)
    X = sparse_merge

    # ---------------------------------------
    # FM model fit
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(
            X, y, test_size=TEST_SIZE, random_state=SPLIT_SEED)

    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=train_X.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=FM_iter,
                    inv_link="identity",
                    threads=4)

    model.fit(train_X, train_y)
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    print('-' * 20)
    if develop:
        preds = model.predict(X=valid_X)
        print("->>>>  FM_FTRL dev RMSLE:",
              rmsle(np.expm1(valid_y), np.expm1(preds)))

    # ---------------------------------------
    # FTRL model fit
    model2 = FTRL(alpha=0.01,
                  beta=0.01,
                  L1=0.00001,
                  L2=1.0,
                  D=train_X.shape[1],
                  iters=FTRL_iter,
                  inv_link="identity",
                  threads=1)
    # del X; gc.collect()
    model2.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model2.predict(X=valid_X)
        print("->>>>  FTRL dev RMSLE:",
              rmsle(np.expm1(valid_y), np.expm1(preds)))

    # Clear variables:
    del X, train_X, train_y, sparse_merge
    gc.collect()

    # ---------------------------------------
    # Testing by chunk
    print(' FM/FTRL: ...reading the test data...')
    predsFM = []
    predsF = []

    for test in load_test():
        test['general_cat'], test['subcat_1'], test['subcat_2'] = \
            zip(*test['category_name'].apply(lambda x: split_cat(x)))
        test.drop('category_name', axis=1, inplace=True)

        handle_missing_inplace(test)
        #print('[{}] Handle missing completed.'.format(time.time() - start_time))

        cutting(test)
        # print('[{}] Cut completed.'.format(time.time() - start_time))

        to_categorical(test)
        # print('[{}] Convert categorical completed'.format(time.time() - start_time))

        # Add some new features:
        X_len_desc_test = test['item_description'].apply(
            lambda x: len(x)).as_matrix().reshape(-1, 1)
        X_len_name_test = test['name'].apply(
            lambda x: len(x)).as_matrix().reshape(-1, 1)

        X_name_test = wb_name.transform(test['name'])
        # X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]

        X_category1_test = wb_cat1.transform(test['general_cat'])
        X_category2_test = wb_cat2.transform(test['subcat_1'])
        X_category3_test = wb_cat3.transform(test['subcat_2'])

        X_description_test = wb_desc.transform(test['item_description'])
        # X_description_test = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]

        X_brand_test = lb.transform(test['brand_name'])

        X_cond_test = transform_dummy(test['item_condition_id'].tolist(),
                                      d_cond)
        X_ship_test = transform_dummy(test['shipping'].tolist(), d_ship)


        X_test = hstack((X_cond_test, X_ship_test, X_description_test, X_brand_test, X_category1_test, \
                         X_category2_test, X_category3_test, X_name_test)).tocsr()
        X_test = X_test[:, mask]

        # Clear variables:
        del X_cond_test, X_ship_test, X_description_test, X_brand_test, X_category1_test, X_category2_test, X_category3_test, X_name_test
        del test
        gc.collect()

        predsFM_batch = model.predict(X_test)
        predsFM += np.array(predsFM_batch).flatten().tolist()

        predsF_batch = model2.predict(X_test)
        predsF += np.array(predsF_batch).flatten().tolist()

    print(np.array(predsFM))
    print('-' * 20)

    print(np.array(predsF))
    print('-' * 20)

    return np.array(predsFM), np.array(predsF)
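
load_test, fit_dummy and transform_dummy are defined elsewhere in the kernel above. The sketches below show plausible implementations; the chunk size and the dict-based one-hot encoding are assumptions, not the original code:

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix

def load_test(chunksize=700000):
    # Yield the test set in chunks so it never has to sit in memory at once.
    for chunk in pd.read_table('../input/test.tsv', engine='c',
                               chunksize=chunksize):
        yield chunk

def fit_dummy(values):
    # Build a value -> column mapping from the training values and one-hot encode them.
    mapping = {v: i for i, v in enumerate(sorted(set(values)))}
    return transform_dummy(values, mapping), mapping

def transform_dummy(values, mapping):
    # One-hot encode with a fixed mapping; unseen values become all-zero rows.
    rows, cols = [], []
    for i, v in enumerate(values):
        if v in mapping:
            rows.append(i)
            cols.append(mapping[v])
    data = np.ones(len(rows), dtype=np.float64)
    return csr_matrix((data, (rows, cols)), shape=(len(values), len(mapping)))
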
Esempio n. 31
0
def main():
    train = pd.read_table('../input/train.tsv', engine='c')
    test = pd.read_table('../input/test.tsv', engine='c')

    print('Finished loading data')
    nrow_test = train.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    y = np.log1p(train["price"])
    merge: pd.DataFrame = pd.concat([train, dftt, test])
    submission: pd.DataFrame = test[['test_id']]

    del train, test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('Split categories completed.')

    handle_missing_inplace(merge)
    print('Handle missing completed.')

    cutting(merge)
    print('Cut completed.')

    to_categorical(merge)
    print('Convert categorical completed')

    cv = CountVectorizer(min_df=NAME_MIN_DF)
    X_name_cv = cv.fit_transform(merge['name'])

    cv = CountVectorizer()
    X_category1_cv = cv.fit_transform(merge['general_cat'])
    X_category2_cv = cv.fit_transform(merge['subcat_1'])
    X_category3_cv = cv.fit_transform(merge['subcat_2'])

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 'hash_ngrams': 2,
                                 'hash_ngrams_weights': [1.5, 1.0],
                                 'hash_size': 2**29,
                                 'norm': None,
                                 'tf': 'binary',
                                 'idf': None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del wb
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1),
                                dtype=bool)]
    print('Vectorize `name` completed.')

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('Count vectorize `categories` completed.')

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 'hash_ngrams': 2,
                                 'hash_ngrams_weights': [1.0, 1.0],
                                 'hash_size': 2**28,
                                 'norm': 'l2',
                                 'tf': 1.0,
                                 'idf': None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del wb
    X_description = X_description[:, np.array(
        np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('Vectorize `item_description` completed.')

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('Label binarize `brand_name` completed.')

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('Get dummies on `item_condition_id` and `shipping` completed.')

    num_chars = merge['item_description'].apply(lambda x: len(x)).values
    num_words = merge['item_description'].apply(
        lambda x: len(x.split(' '))).values
    num_upper = merge['item_description'].apply(
        lambda x: len(re.findall('[A-Z]+', x))).values
    num_chars = num_chars / max(num_chars)
    num_words = num_words / max(num_words)
    num_upper = num_upper / max(num_upper)

    X_feature = np.vstack([num_chars, num_words, num_upper]).T
    print('musicmilif features completed.')

    sparse_merge = hstack(
        (X_dummies, X_description, X_brand, X_category1, X_category2,
         X_category3, X_name, X_category1_cv, X_category2_cv, X_category3_cv,
         X_name_cv, X_feature)).tocsr()
    print('Create sparse merge completed')
    del X_dummies, X_description, X_brand, X_category1, X_category2, X_category3
    del X_name, X_category1_cv, X_category2_cv, X_category3_cv, X_name_cv, X_feature
    del num_chars, num_words, num_upper
    gc.collect()

    # Remove features with document frequency <=1
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]

    gc.collect()

    train_X, train_y = X, y

    model = Ridge(solver='auto',
                  fit_intercept=True,
                  alpha=5.0,
                  max_iter=100,
                  normalize=False,
                  tol=0.05)
    model.fit(train_X, train_y)
    print('Train Ridge completed')
    predsR = model.predict(X_test)
    print('Predict Ridge completed')

    model = FTRL(alpha=0.01,
                 beta=0.1,
                 L1=0.00001,
                 L2=1.0,
                 D=sparse_merge.shape[1],
                 iters=50,
                 inv_link="identity",
                 threads=1)
    model.fit(train_X, train_y)
    print('Train FTRL completed')
    predsF = model.predict(X_test)
    print('Predict FTRL completed')

    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=17,
                    inv_link="identity",
                    threads=4)
    model.fit(train_X, train_y)
    print('Train FM_FTRL completed')
    predsFM = model.predict(X_test)
    print('Predict FM_FTRL completed')

    params = {
        'learning_rate': 0.6,
        'application': 'regression',
        'max_depth': 9,
        'num_leaves': 24,
        'verbosity': -1,
        'metric': 'RMSE',
        'data_random_seed': 1,
        'bagging_fraction': 0.9,
        'bagging_freq': 6,
        'feature_fraction': 0.8,
        'nthread': 4,
        'min_data_in_leaf': 51,
        'max_bin': 64
    }

    # Remove features with document frequency <=200
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 200, 0, 1),
                    dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]

    train_X, train_y = X, y
    d_train = lgb.Dataset(train_X, label=train_y)
    watchlist = [d_train]
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=1800,
                      valid_sets=watchlist,
                      early_stopping_rounds=500,
                      verbose_eval=400)

    predsL = model.predict(X_test)
    print('Predict LGBM completed')

    preds = (predsR * 1 + predsF * 1 + predsFM * 16 + predsL * 6) / (1 + 1 +
                                                                     16 + 6)
    submission['price'] = np.expm1(preds)
    submission.to_csv("submission.csv", index=False)