class WordvecRegressor(object):
    def __init__(self, pickle_model="", datadir=None):
        self.wb = wordbatch.WordBatch(
            normalize_text,
            extractor=(Hstack, [
                (WordVec, {"wordvec_file": "../../../data/word2vec/glove.twitter.27B.100d.txt.gz",
                           "normalize_text": normalize_text}),
                (WordVec, {"wordvec_file": "../../../data/word2vec/glove.6B.50d.txt.gz",
                           "normalize_text": normalize_text})]))
        self.wb.dictionary_freeze = True
        self.clf = FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=1,
                        inv_link="identity")
        if datadir is None:
            (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)

    def fit_batch(self, texts, labels, rcount):
        texts, labels = self.wb.shuffle_batch(texts, labels, rcount)
        print("Transforming", rcount)
        texts = self.wb.fit_transform(texts, reset=False)
        print("Training", rcount)
        self.clf.fit(texts, labels, reset=False)

    def train(self, datadir, pickle_model=""):
        texts = []
        labels = []
        training_data = os.listdir(datadir)
        rcount = 0
        batchsize = 100000
        p = None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    # if rcount > 1000000: break
                    try:
                        line = json.loads(line.strip())
                    except:
                        continue
                    for review in line["Reviews"]:
                        rcount += 1
                        if rcount % 100000 == 0:
                            print(rcount)
                        if rcount % 6 != 0:
                            continue
                        if "Overall" not in review["Ratings"]:
                            continue
                        texts.append(review["Content"])
                        labels.append((float(review["Ratings"]["Overall"]) - 3) * 0.5)
                        if len(texts) % batchsize == 0:
                            if p is not None:
                                p.join()
                            p = threading.Thread(target=self.fit_batch,
                                                 args=(texts, labels, rcount))
                            p.start()
                            texts = []
                            labels = []
        if p is not None:
            p.join()
        self.fit_batch(texts, labels, rcount)
        if pickle_model != "":
            with gzip.open(pickle_model, 'wb') as model_file:
                pkl.dump((self.wb, self.clf), model_file, protocol=2)

    def predict(self, texts):
        vecs = self.wb.transform(texts)
        return self.clf.predict(vecs)
def __init__(self, pickle_model="", datadir=None):
    self.wb = wordbatch.WordBatch(normalize_text,
                                  extractor=(WordBag, {"hash_ngrams": 3,
                                                       "hash_ngrams_weights": [-1.0, -1.0, 1.0],
                                                       "hash_size": 2 ** 23,
                                                       "norm": 'l2',
                                                       "tf": 'binary',
                                                       "idf": 50.0}))
    self.clf = FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 23, iters=1,
                    inv_link="identity")
    if datadir is None:
        (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
    else:
        self.train(datadir, pickle_model)
class WordbagRegressor(object):
    def __init__(self, pickle_model="", datadir=None):
        self.wordbatch = wordbatch.WordBatch(normalize_text,
                                             extractors=[(wordbatch.WordBag,
                                                          {"hash_ngrams": 3,
                                                           "hash_ngrams_weights": [-1.0, -1.0, 1.0],
                                                           "hash_size": 2 ** 23,
                                                           "norm": 'l2',
                                                           "tf": 'binary',
                                                           "idf": 50.0})])
        self.clf = FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=1,
                        inv_link="identity")
        if datadir is None:
            (self.wordbatch, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)

    def fit_batch(self, texts, labels, rcount):
        texts, labels = self.wordbatch.shuffle_batch(texts, labels, rcount)
        print("Transforming", rcount)
        texts = self.wordbatch.transform(texts)
        print("Training", rcount)
        self.clf.fit(texts, labels)

    def train(self, datadir, pickle_model=""):
        texts = []
        labels = []
        training_data = os.listdir(datadir)
        rcount = 0
        batchsize = 100000
        p = None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    # if rcount > 1000000: break
                    try:
                        line = json.loads(line.strip())
                    except:
                        continue
                    for review in line["Reviews"]:
                        rcount += 1
                        if rcount % 100000 == 0:
                            print(rcount)
                        if rcount % 7 != 0:
                            continue
                        if "Overall" not in review["Ratings"]:
                            continue
                        texts.append(review["Content"])
                        labels.append((float(review["Ratings"]["Overall"]) - 3) * 0.5)
                        if len(texts) % batchsize == 0:
                            if p is not None:
                                p.join()
                            p = threading.Thread(target=self.fit_batch,
                                                 args=(texts, labels, rcount))
                            p.start()
                            texts = []
                            labels = []
        if p is not None:
            p.join()
        self.fit_batch(texts, labels, rcount)
        self.wordbatch.dictionary_freeze = True
        if pickle_model != "":
            with gzip.open(pickle_model, 'wb') as model_file:
                pkl.dump((self.wordbatch, self.clf), model_file, protocol=2)

    def predict(self, texts):
        counts = self.wordbatch.transform(texts)
        return self.clf.predict(counts)
def __init__(self, spar_type, spar_penalty):
    # We create a separate model for each action in the environment's
    # action space. Alternatively we could somehow encode the action
    # into the features, but this way it's easier to code up.
    self.models = []
    for _ in range(env.action_space.n):
        # model = Lasso(alpha=0.01)
        model = SGDRegressor(learning_rate='constant', penalty=spar_type,
                             l1_ratio=spar_penalty, max_iter=1000)
        model1 = PassiveAggressiveRegressor()
        model2 = Lasso(alpha=0.1, normalize=True, warm_start=True)
        model3 = FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=1)
        # l2, l1, none, elasticnet  #, penalty='l1', l1_ratio=0)  # learning_rate="constant"
        # We need to call partial_fit once to initialize the model
        # or we get a NotFittedError when trying to make a prediction.
        # This is quite hacky.
        # model2.fit([self.featurize_state(env.reset())], [0])
        # X = np.array([self.featurize_state(env.reset())])
        # Y = np.array([0])
        # print(X.shape, Y.shape)
        # model.partial_fit(X, Y)
        model.partial_fit([self.featurize_state(env.reset())], [0])
        self.models.append(model)
class vanila_FTRL_Regressor:
    def __init__(self, param_dict, feature_dim):
        alpha = param_dict['alpha']
        beta = param_dict['beta']
        L1 = param_dict['L1']
        L2 = param_dict['L2']
        iters = param_dict['iters']
        self.model = FTRL(alpha=alpha, beta=beta, L1=L1, L2=L2, D=feature_dim,
                          iters=iters, inv_link="identity", threads=6)

    def fit(self, X_train, y_train):
        self.model.fit(X_train, y_train)

    def predict(self, X_valid):
        return self.model.predict(X_valid)
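# Hedged usage sketch (not part of the original sources): drives the
# vanila_FTRL_Regressor wrapper above end-to-end. The hyperparameter values
# and the random sparse data below are illustrative assumptions only.
import numpy as np
from scipy.sparse import random as sparse_random

demo_params = {'alpha': 0.1, 'beta': 1.0, 'L1': 0.00001, 'L2': 1.0, 'iters': 3}
X_demo = sparse_random(1000, 2 ** 10, density=0.01, format='csr')
y_demo = np.random.rand(1000)

demo_reg = vanila_FTRL_Regressor(demo_params, feature_dim=X_demo.shape[1])
demo_reg.fit(X_demo, y_demo)
demo_preds = demo_reg.predict(X_demo)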
def FTRL_train(train_X, train_y, isQuickRun):
    if isQuickRun:
        model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=train_X.shape[1],
                     iters=9, inv_link="identity", threads=4)
    else:
        model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=train_X.shape[1],
                     iters=47, inv_link="identity", threads=4)
    model.fit(train_X, train_y)
    return model
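# Hedged usage sketch (illustrative, not from the original sources): shows how
# FTRL_train above could be called on a small random design matrix and scored
# with RMSE; the data and the isQuickRun flag value are assumptions.
import numpy as np
from scipy.sparse import random as sparse_random

X_quick = sparse_random(500, 128, density=0.05, format='csr')
y_quick = np.random.rand(500)
quick_model = FTRL_train(X_quick, y_quick, isQuickRun=True)
quick_rmse = np.sqrt(((quick_model.predict(X_quick) - y_quick) ** 2).mean())
print("quick-run train RMSE:", quick_rmse)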
gc.collect()

# Take the log of the target
y = np.log1p(target)
if enable_validation:
    truth_sr = np.log1p(truth)
del train, target
gc.collect()

FTRL_model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=X.shape[1],
                  iters=50, inv_link="identity", threads=1)
FTRL_model.fit(X, y)
print("[{}] Train FTRL completed".format(time.time() - start_time))

FM_FTRL_model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=X.shape[1],
                        alpha_fm=0.01, L2_fm=0.0, init_fm=0.01, D_fm=200,
aver_rmse = 0.0
for train_index, val_index in kf.split(y):
    fold_id += 1
    print("Fold {} start...".format(fold_id))
    train_X, valid_X = X[train_index], X[val_index]
    train_y, valid_y = y[train_index], y[val_index]
    # train_X, valid_X, train_y, valid_y = train_test_split(X_train, y, test_size=0.2, random_state=42)
    # del X_train, y
    d_shape = train_X.shape[1]
    print('d_shape', d_shape)
    model = FTRL(alpha=0.01, beta=0.1, L1=0.1, L2=10, D=d_shape, iters=5,
                 inv_link="identity", threads=8)
    model.fit(train_X, train_y)

    def rmse(predictions, targets):
        print("calculating RMSE ...")
        return np.sqrt(((predictions - targets) ** 2).mean())

    preds_valid_ftrl = model.predict(X=valid_X)
    # print(" FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds_valid_ftrl)))
    print(" FTRL dev RMSLE:", rmse(valid_y, preds_valid_ftrl))
    # model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=X_train.shape[1], alpha_fm=0.01, L2_fm=0.0,
    #                 init_fm=0.01, D_fm=200, e_noise=0.0001, iters=15, inv_link="identity", threads=4)
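    # Hedged follow-up sketch (not from the original source): in the related
    # kernels valid_y is already np.log1p(price), so the plain RMSE printed
    # above is effectively the RMSLE of the un-logged predictions; one way to
    # report a cross-validated score is to accumulate it per fold.
    aver_rmse += rmse(valid_y, preds_valid_ftrl)
print("average FTRL dev RMSLE over folds:", aver_rmse / kf.get_n_splits())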
# from sklearn.feature_extraction.text import HashingVectorizer
# from sklearn.linear_model import *
# vct = HashingVectorizer()
# clf = SGDRegressor()
import wordbatch
from wordbatch.models import FTRL
from wordbatch.extractors import WordBag

wb = wordbatch.WordBatch(extractor=(WordBag, {"hash_ngrams": 2,
                                              "hash_ngrams_weights": [0.5, -1.0],
                                              "hash_size": 2 ** 23,
                                              "norm": 'l2',
                                              "tf": 'log',
                                              "idf": 50.0}))
clf = FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=1)

train_texts = ["Cut down a tree with a herring? It can't be done.",
               "Don't say that word.",
               "How can we not say the word if you don't tell us what it is?"]
train_labels = [1, 0, 1]
test_texts = ["Wait! I said it! I said it! Ooh! I said it again!"]

values = wb.transform(train_texts)
clf.fit(values, train_labels)
preds = clf.predict(wb.transform(test_texts))
print("values={}".format(values))
print("values shape={}".format(values.shape))
print("texts={}".format(test_texts))
print("transformed={}".format(wb.transform(test_texts)))
print(preds)
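# Hedged follow-up sketch (illustrative): the regressor classes elsewhere in
# this file persist the (extractor, model) pair with gzip + pickle; the same
# pattern applied to the toy example above. The file name is an assumption.
import gzip
import pickle as pkl

with gzip.open("toy_ftrl_model.pkl.gz", "wb") as model_file:
    pkl.dump((wb, clf), model_file, protocol=2)
with gzip.open("toy_ftrl_model.pkl.gz", "rb") as model_file:
    wb_loaded, clf_loaded = pkl.load(model_file)
print(clf_loaded.predict(wb_loaded.transform(test_texts)))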
def get_pred_ftrl(submission): start_time = time.time() from time import gmtime, strftime print(strftime("%Y-%m-%d %H:%M:%S", gmtime())) # if 1 == 1: train = pd.read_table( '../input/mercari-price-suggestion-challenge/train.tsv', engine='c') test = pd.read_table( '../input/mercari-price-suggestion-challenge/test.tsv', engine='c') #train = pd.read_table('../input/train.tsv', engine='c') #test = pd.read_table('../input/test.tsv', engine='c') print('[{}] Finished to load data'.format(time.time() - start_time)) print('Train shape: ', train.shape) print('Test shape: ', test.shape) nrow_test = train.shape[0] # -dftt.shape[0] train = train[train["price"] != 0] #Xtrain,Xvalid = train_test_split(train, test_size=0.01,random_state=1) nrow_train = train.shape[0] #nrow_valid = Xvalid.shape[0] # print(nrow_train, nrow_test) y = np.log1p(train["price"]) merge: pd.DataFrame = pd.concat([train, test]) #submission: pd.DataFrame = test[['test_id']] del train del test gc.collect() merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \ zip(*merge['category_name'].apply(lambda x: split_cat(x))) merge.drop('category_name', axis=1, inplace=True) print('[{}] Split categories completed.'.format(time.time() - start_time)) handle_missing_inplace(merge) print('[{}] Handle missing completed.'.format(time.time() - start_time)) cutting(merge) print('[{}] Cut completed.'.format(time.time() - start_time)) to_categorical(merge) print('[{}] Convert categorical completed'.format(time.time() - start_time)) wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, { "hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0], "hash_size": 2**29, "norm": None, "tf": 'binary', "idf": None, }), procs=8) wb.dictionary_freeze = True X_name = wb.fit_transform(merge['name']) del (wb) X_name = X_name[:, np. array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)] print('[{}] Vectorize `name` completed.'.format(time.time() - start_time)) wb = CountVectorizer() X_category1 = wb.fit_transform(merge['general_cat']) X_category2 = wb.fit_transform(merge['subcat_1']) X_category3 = wb.fit_transform(merge['subcat_2']) print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time)) # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5], wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, { "hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0], "hash_size": 2**28, "norm": "l2", "tf": 1.0, "idf": None }), procs=8) wb.dictionary_freeze = True X_description = wb.fit_transform(merge['item_description']) del (wb) X_description = X_description[:, np.array(np.clip( X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)] print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time)) lb = LabelBinarizer(sparse_output=True) X_brand = lb.fit_transform(merge['brand_name']) print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time)) X_dummies = csr_matrix( pd.get_dummies(merge[['item_condition_id', 'shipping']], sparse=True).values) print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'. 
format(time.time() - start_time)) print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape, X_name.shape) sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name)).tocsr() print('[{}] Create sparse merge completed'.format(time.time() - start_time)) # pd.to_pickle((sparse_merge, y), "xy.pkl") # else: # nrow_train, nrow_test= 1481661, 1482535 # sparse_merge, y = pd.read_pickle("xy.pkl") # Remove features with document frequency <=1 print(sparse_merge.shape) mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool) sparse_merge = sparse_merge[:, mask] X = sparse_merge[:nrow_train] X_test = sparse_merge[nrow_train:] print(sparse_merge.shape) gc.collect() train_X, train_y = X, y #''' if develop: train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05, random_state=100) model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=sparse_merge.shape[1], iters=50, inv_link="identity", threads=1) model.fit(train_X, train_y) print('[{}] Train FTRL completed'.format(time.time() - start_time)) if develop: preds = model.predict(X=valid_X) print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds))) predsF = model.predict(X_test) submission['price_FTRL'] = predsF #print(rmsle(np.expm1(predsF),y_valid)) #''' print('[{}] Predict FTRL completed'.format(time.time() - start_time)) model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0, init_fm=0.01, D_fm=200, e_noise=0.0001, iters=17, inv_link="identity", threads=4) model.fit(train_X, train_y) print('[{}] Train ridge v2 completed'.format(time.time() - start_time)) if develop: preds = model.predict(X=valid_X) print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds))) predsFM = model.predict(X_test) print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time)) submission['price_FM_FTRL'] = predsFM
class WordvecRegressor(object):
    def __init__(self, pickle_model="", datadir=None):
        self.wb = wordbatch.WordBatch(
            normalize_text,
            extractor=(Hstack, [
                (WordVec, {"wordvec_file": "../../../data/word2vec/glove.twitter.27B.100d.txt.gz",
                           "normalize_text": normalize_text}),
                (WordVec, {"wordvec_file": "../../../data/word2vec/glove.6B.50d.txt.gz",
                           "normalize_text": normalize_text})]))
        self.wb.dictionary_freeze = True
        self.clf = FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=1,
                        inv_link="identity")
        if datadir is None:
            (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)

    def fit_batch(self, texts, labels, rcount):
        texts, labels = self.wb.shuffle_batch(texts, labels, rcount)
        print("Transforming", rcount)
        texts = self.wb.transform(texts)
        print("Training", rcount)
        self.clf.fit(texts, labels)

    def train(self, datadir, pickle_model=""):
        texts = []
        labels = []
        training_data = os.listdir(datadir)
        rcount = 0
        batchsize = 100000
        p = None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    # if rcount > 1000000: break
                    try:
                        line = json.loads(line.strip())
                    except:
                        continue
                    for review in line["Reviews"]:
                        rcount += 1
                        if rcount % 100000 == 0:
                            print(rcount)
                        if rcount % 6 != 0:
                            continue
                        if "Overall" not in review["Ratings"]:
                            continue
                        texts.append(review["Content"])
                        labels.append((float(review["Ratings"]["Overall"]) - 3) * 0.5)
                        if len(texts) % batchsize == 0:
                            if p is not None:
                                p.join()
                            p = threading.Thread(target=self.fit_batch,
                                                 args=(texts, labels, rcount))
                            p.start()
                            texts = []
                            labels = []
        if p is not None:
            p.join()
        self.fit_batch(texts, labels, rcount)
        if pickle_model != "":
            with gzip.open(pickle_model, 'wb') as model_file:
                pkl.dump((self.wb, self.clf), model_file, protocol=2)

    def predict(self, texts):
        vecs = self.wb.transform(texts)
        return self.clf.predict(vecs)
def main(): start_time = time.time() from time import gmtime, strftime print(strftime("%Y-%m-%d %H:%M:%S", gmtime())) # if 1 == 1: ###train = pd.read_table('../input/mercari-price-suggestion-challenge/train.tsv', engine='c') ###test = pd.read_table('../input/mercari-price-suggestion-challenge/test.tsv', engine='c') train = pd.read_table('../input/train.tsv', engine='c') test = pd.read_table('../input/test.tsv', engine='c') print('[{}] Finished to load data'.format(time.time() - start_time)) print('Train shape: ', train.shape) print('Test shape: ', test.shape) nrow_test = train.shape[0] # -dftt.shape[0] dftt = train[(train.price < 1.0)] train = train.drop(train[(train.price < 1.0)].index) del dftt['price'] nrow_train = train.shape[0] # print(nrow_train, nrow_test) y = np.log1p(train["price"]) merge = pd.concat([train, dftt, test]) submission = test[['test_id']] del train del test gc.collect() merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \ zip(*merge['category_name'].apply(lambda x: split_cat(x))) merge.drop('category_name', axis=1, inplace=True) print('[{}] Split categories completed.'.format(time.time() - start_time)) handle_missing_inplace(merge) print('[{}] Handle missing completed.'.format(time.time() - start_time)) cutting(merge) print('[{}] Cut completed.'.format(time.time() - start_time)) to_categorical(merge) print('[{}] Convert categorical completed'.format(time.time() - start_time)) wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, { "hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0], "hash_size": 2**29, "norm": None, "tf": 'binary', "idf": None, }), procs=8) wb.dictionary_freeze = True X_name = wb.fit_transform(merge['name']) del (wb) X_name = X_name[:, np. array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)] print('[{}] Vectorize `name` completed.'.format(time.time() - start_time)) wb = CountVectorizer() X_category1 = wb.fit_transform(merge['general_cat']) X_category2 = wb.fit_transform(merge['subcat_1']) X_category3 = wb.fit_transform(merge['subcat_2']) print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time)) # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5], wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, { "hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0], "hash_size": 2**28, "norm": "l2", "tf": 1.0, "idf": None }), procs=8) wb.dictionary_freeze = True X_description = wb.fit_transform(merge['item_description']) del (wb) X_description = X_description[:, np.array(np.clip( X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)] print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time)) lb = LabelBinarizer(sparse_output=True) X_brand = lb.fit_transform(merge['brand_name']) print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time)) X_dummies = csr_matrix( pd.get_dummies(merge[['item_condition_id', 'shipping']], sparse=True).values) print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'. 
format(time.time() - start_time)) print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape, X_name.shape) sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name)).tocsr() print('[{}] Create sparse merge completed'.format(time.time() - start_time)) # pd.to_pickle((sparse_merge, y), "xy.pkl") # else: # nrow_train, nrow_test= 1481661, 1482535 # sparse_merge, y = pd.read_pickle("xy.pkl") # Remove features with document frequency <=1 print(sparse_merge.shape) mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool) sparse_merge = sparse_merge[:, mask] X = sparse_merge[:nrow_train] X_test = sparse_merge[nrow_test:] print(sparse_merge.shape) gc.collect() train_X, train_y = X, y if develop: train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05, random_state=100) model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=sparse_merge.shape[1], iters=50, inv_link="identity", threads=1) model.fit(train_X, train_y) print('[{}] Train FTRL completed'.format(time.time() - start_time)) if develop: preds = model.predict(X=valid_X) print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds))) predsF = model.predict(X_test) print('[{}] Predict FTRL completed'.format(time.time() - start_time)) model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0, init_fm=0.01, D_fm=200, e_noise=0.0001, iters=15, inv_link="identity", threads=4) model.fit(train_X, train_y) print('[{}] Train ridge v2 completed'.format(time.time() - start_time)) if develop: preds = model.predict(X=valid_X) print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds))) predsFM = model.predict(X_test) print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time)) params = { 'learning_rate': 0.6, 'application': 'regression', 'max_depth': 4, 'num_leaves': 31, 'verbosity': -1, 'metric': 'RMSE', 'data_random_seed': 1, 'bagging_fraction': 0.6, 'bagging_freq': 5, 'feature_fraction': 0.6, 'nthread': 4, 'min_data_in_leaf': 100, 'max_bin': 31 } # Remove features with document frequency <=100 print(sparse_merge.shape) mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 100, 0, 1), dtype=bool) sparse_merge = sparse_merge[:, mask] X = sparse_merge[:nrow_train] X_test = sparse_merge[nrow_test:] print(sparse_merge.shape) train_X, train_y = X, y if develop: train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05, random_state=100) d_train = lgb.Dataset(train_X, label=train_y) watchlist = [d_train] if develop: d_valid = lgb.Dataset(valid_X, label=valid_y) watchlist = [d_train, d_valid] model = lgb.train(params, train_set=d_train, num_boost_round=6000, valid_sets=watchlist, \ early_stopping_rounds=1000, verbose_eval=1000) if develop: preds = model.predict(valid_X) print("LGB dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds))) predsL = model.predict(X_test) print('[{}] Predict LGB completed.'.format(time.time() - start_time)) preds = (predsF * 0.2 + predsL * 0.3 + predsFM * 0.5) submission['price'] = np.expm1(preds) submission.to_csv("submission_wordbatch_ftrl_fm_lgb.csv", index=False)
def main(): start_time = time.time() from time import gmtime, strftime print(strftime("%Y-%m-%d %H:%M:%S", gmtime())) # if 1 == 1: train = pd.read_table('../input/mercari-price-suggestion-challenge/train.tsv', engine='c') test = pd.read_table('../input/mercari-price-suggestion-challenge/test.tsv', engine='c') #train = pd.read_table('../input/train.tsv', engine='c') #test = pd.read_table('../input/test.tsv', engine='c') print('[{}] Finished to load data'.format(time.time() - start_time)) print('Train shape: ', train.shape) print('Test shape: ', test.shape) nrow_test = train.shape[0] # -dftt.shape[0] dftt = train[(train.price < 1.0)] train = train.drop(train[(train.price < 1.0)].index) del dftt['price'] nrow_train = train.shape[0] # print(nrow_train, nrow_test) y = np.log1p(train["price"]) merge: pd.DataFrame = pd.concat([train, dftt, test]) submission: pd.DataFrame = test[['test_id']] del train del test gc.collect() merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \ zip(*merge['category_name'].apply(lambda x: split_cat(x))) merge.drop('category_name', axis=1, inplace=True) print('[{}] Split categories completed.'.format(time.time() - start_time)) handle_missing_inplace(merge) print('[{}] Handle missing completed.'.format(time.time() - start_time)) cutting(merge) print('[{}] Cut completed.'.format(time.time() - start_time)) to_categorical(merge) print('[{}] Convert categorical completed'.format(time.time() - start_time)) wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0], "hash_size": 2 ** 29, "norm": None, "tf": 'binary', "idf": None, }), procs=8) wb.dictionary_freeze= True X_name = wb.fit_transform(merge['name']) del(wb) X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)] print('[{}] Vectorize `name` completed.'.format(time.time() - start_time)) wb = CountVectorizer() X_category1 = wb.fit_transform(merge['general_cat']) X_category2 = wb.fit_transform(merge['subcat_1']) X_category3 = wb.fit_transform(merge['subcat_2']) print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time)) # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5], wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0], "hash_size": 2 ** 28, "norm": "l2", "tf": 1.0, "idf": None}) , procs=8) wb.dictionary_freeze= True X_description = wb.fit_transform(merge['item_description']) del(wb) X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)] print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time)) lb = LabelBinarizer(sparse_output=True) X_brand = lb.fit_transform(merge['brand_name']) print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time)) X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']], sparse=True).values) print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(time.time() - start_time)) print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape, X_name.shape) sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name)).tocsr() print('[{}] Create sparse merge completed'.format(time.time() - start_time)) # pd.to_pickle((sparse_merge, y), "xy.pkl") # else: # nrow_train, nrow_test= 1481661, 1482535 # sparse_merge, y 
= pd.read_pickle("xy.pkl") # Remove features with document frequency <=1 print(sparse_merge.shape) mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool) sparse_merge = sparse_merge[:, mask] X = sparse_merge[:nrow_train] X_test = sparse_merge[nrow_test:] print(sparse_merge.shape) gc.collect() train_X, train_y = X, y if develop: train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05, random_state=100) model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=sparse_merge.shape[1], iters=50, inv_link="identity", threads=1) model.fit(train_X, train_y) print('[{}] Train FTRL completed'.format(time.time() - start_time)) if develop: preds = model.predict(X=valid_X) print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds))) predsF = model.predict(X_test) print('[{}] Predict FTRL completed'.format(time.time() - start_time)) model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0, init_fm=0.01, D_fm=200, e_noise=0.0001, iters=15, inv_link="identity", threads=4) model.fit(train_X, train_y) print('[{}] Train ridge v2 completed'.format(time.time() - start_time)) if develop: preds = model.predict(X=valid_X) print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds))) predsFM = model.predict(X_test) print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time)) params = { 'learning_rate': 0.6, 'application': 'regression', 'max_depth': 4, 'num_leaves': 31, 'verbosity': -1, 'metric': 'RMSE', 'data_random_seed': 1, 'bagging_fraction': 0.6, 'bagging_freq': 5, 'feature_fraction': 0.6, 'nthread': 4, 'min_data_in_leaf': 100, 'max_bin': 31 } # Remove features with document frequency <=100 print(sparse_merge.shape) mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 100, 0, 1), dtype=bool) sparse_merge = sparse_merge[:, mask] X = sparse_merge[:nrow_train] X_test = sparse_merge[nrow_test:] print(sparse_merge.shape) train_X, train_y = X, y if develop: train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05, random_state=100) d_train = lgb.Dataset(train_X, label=train_y) watchlist = [d_train] if develop: d_valid = lgb.Dataset(valid_X, label=valid_y) watchlist = [d_train, d_valid] model = lgb.train(params, train_set=d_train, num_boost_round=6000, valid_sets=watchlist, \ early_stopping_rounds=1000, verbose_eval=1000) if develop: preds = model.predict(valid_X) print("LGB dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds))) predsL = model.predict(X_test) print('[{}] Predict LGB completed.'.format(time.time() - start_time)) preds = (predsF * 0.2 + predsL * 0.3 + predsFM * 0.5) submission['price'] = np.expm1(preds) submission.to_csv("submission_wordbatch_ftrl_fm_lgb.csv", index=False)
class WordbagRegressor(object):
    def __init__(self, pickle_model="", datadir=None, batcher=None):
        self.wb = WordBatch(normalize_text=normalize_text,
                            extractor=WordBag(hash_ngrams=3,
                                              hash_ngrams_weights=[-1.0, -1.0, 1.0],
                                              hash_size=2 ** 23,
                                              norm='l2',
                                              tf='binary',
                                              idf=50.0),
                            batcher=batcher)
        self.clf = FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 23, iters=1,
                        inv_link="identity")
        if datadir is None:
            (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)

    def fit_batch(self, texts, labels, rcount):
        texts, labels = self.wb.batcher.shuffle_batch(texts, labels, rcount)
        print("Transforming", rcount)
        texts = self.wb.fit_transform(texts, reset=False)
        print("Training", rcount)
        self.clf.fit(texts, labels, reset=False)

    def train(self, datadir, pickle_model=""):
        texts = []
        labels = []
        training_data = os.listdir(datadir)
        rcount = 0
        batchsize = 100000
        p = None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    # if rcount > 1000000: break
                    try:
                        line = json.loads(line.strip())
                    except:
                        continue
                    for review in line["Reviews"]:
                        rcount += 1
                        if rcount % 100000 == 0:
                            print(rcount)
                        if rcount % 7 != 0:
                            continue
                        if "Overall" not in review["Ratings"]:
                            continue
                        texts.append(review["Content"])
                        labels.append((float(review["Ratings"]["Overall"]) - 3) * 0.5)
                        if len(texts) % batchsize == 0:
                            if p is not None:
                                p.join()
                            p = threading.Thread(target=self.fit_batch,
                                                 args=(texts, labels, rcount))
                            p.start()
                            texts = []
                            labels = []
        if p is not None:
            p.join()
        self.fit_batch(texts, labels, rcount)
        self.wb.dictionary_freeze = True
        if pickle_model != "":
            with gzip.open(pickle_model, 'wb') as model_file:
                backend = self.wb.batcher.backend
                backend_handle = self.wb.batcher.backend_handle
                self.wb.batcher.backend = "serial"
                self.wb.batcher.backend_handle = None
                pkl.dump((self.wb, self.clf), model_file, protocol=2)
                self.wb.batcher.backend = backend
                self.wb.batcher.backend_handle = backend_handle

    def predict(self, texts):
        counts = self.wb.transform(texts)
        return self.clf.predict(counts)
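# Hedged usage sketch (not from the original sources): one plausible way to
# construct the batcher and drive the WordbagRegressor above, assuming the
# wordbatch 1.4+ layout (wordbatch.batcher.Batcher) and a local directory of
# TripAdvisor-style review JSON files; both paths are assumptions.
from wordbatch.batcher import Batcher

if __name__ == "__main__":
    batcher = Batcher(procs=4)
    wbr = WordbagRegressor(pickle_model="wordbag_model.pkl.gz",
                           datadir="../../../data/tripadvisor/json",
                           batcher=batcher)
    print(wbr.predict(["Excellent location and very friendly staff."]))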
class WordbagRegressor(object): def __init__(self, pickle_model="", datadir=None): from pyspark import SparkContext self.sc= SparkContext() self.wordbatch = wordbatch.WordBatch(normalize_text, backend="spark", backend_handle=self.sc, extractor=(WordBag, {"hash_ngrams":3, "hash_ngrams_weights":[-1.0, -1.0, 1.0], "hash_size":2**23, "norm":'l2', "tf":'binary', "idf":50.0})) self.clf= FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 23, iters=1, inv_link="identity") if datadir==None: (self.wordbatch, self.clf)= pkl.load(gzip.open(pickle_model, 'rb')) else: self.train(datadir, pickle_model) def fit_batch(self, texts, labels, rcount): print("Transforming", rcount) # if self.sc != None: # data_rdd= self.wordbatch.lists2rddbatches([texts, labels], self.sc) # data_rdd= self.wordbatch.transform(data_rdd) # [texts, labels]= self.wordbatch.rddbatches2lists(data_rdd) # else: # print(texts[:2]) # print(pd.Series(labels).value_counts()) texts= self.wordbatch.partial_fit_transform(texts) print("Training", rcount) self.clf.partial_fit(texts, labels) def train(self, datadir, pickle_model=""): texts= [] labels= [] training_data= os.listdir(datadir) rcount= 0 batchsize= 20000 p = None for jsonfile in training_data: with open(datadir + "/" + jsonfile, 'r') as inputfile: for line in inputfile: #if rcount > 1000000: break try: line = json.loads(line.strip()) except: continue for review in line["Reviews"]: rcount+= 1 if rcount % 100000 == 0: print(rcount) if rcount % 7 != 0: continue if "Overall" not in review["Ratings"]: continue texts.append(review["Content"]) labels.append((float(review["Ratings"]["Overall"]) - 3) *0.5) if len(texts) % batchsize == 0: if p != None: p.join() p= threading.Thread(target=self.fit_batch, args=(texts, labels, rcount)) p.start() texts= [] labels= [] if p != None: p.join() self.fit_batch(texts, labels, rcount) self.wordbatch.dictionary_freeze= True if pickle_model!="": with gzip.open(pickle_model, 'wb') as model_file: pkl.dump((self.wordbatch, self.clf), model_file, protocol=2) def predict(self, texts): # if self.sc != None: # data_rdd= self.wordbatch.lists2rddbatches([texts, []], self.sc) # data_rdd= self.wordbatch.transform(data_rdd) # [counts, labels]= self.wordbatch.rddbatches2lists(data_rdd) # else: counts= self.wordbatch.transform(texts) return self.clf.predict(counts) def predict_parallel(self, texts): # if self.sc != None: # data_rdd= self.wordbatch.lists2rddbatches([texts, []] , self.sc) # counts_rdd= self.wordbatch.transform(data_rdd) # return self.wordbatch.rddbatches2lists(self.wordbatch.predict_parallel(counts_rdd, self.clf))[0] counts= self.wordbatch.transform(texts) return self.wordbatch.predict_parallel(counts, self.clf)
def main(test, logger): logger.info('Start . . .') train = pd.read_table('../input/train.tsv', engine='c') logger.info('Load train') logger.info('train shape {}'.format(train.shape)) logger.info('test shape {}'.format(test.shape)) nrow_train = train.shape[0] y = np.log1p(train['price']) train_low_price = train.loc[train['price'] < 1.] train = train.drop(train[train['price'] < 1.].index) del train_low_price['price'] logger.info('train_low_price shape {}'.format(train_low_price.shape)) df_full = pd.concat([train, train_low_price, test]) logger.info('df_full shape {}'.format(df_full.shape)) sub = test[['test_id']] logger.info('sub shape {}'.format(sub.shape)) del train, test gc.collect() df_full['general_cat'], df_full['subcat_1'], df_full['subcat_2'] = zip( *df_full['category_name'].apply(lambda x: split_category(x))) df_full.drop(['category_name'], axis=1, inplace=True) logger.info('Split category_name') gc.collect() df_full = impute_missing_value(df_full) logger.info('Impute missing value') gc.collect() df_full = cut_df(df_full) logger.info('Cut categories') gc.collect() df_full = to_categorical(df_full) logger.info('Convert to categorical features') gc.collect() wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, { "hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0], "hash_size": 2**29, "norm": None, "tf": 'binary', "idf": None, }), procs=8) wb.dictionary_freeze = True X_name = wb.fit_transform(df_full['name']) del wb gc.collect() X_name = X_name[:, np. array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)] logger.info('Vectorize name') gc.collect() cnt_vec = CountVectorizer() X_cat_1 = cnt_vec.fit_transform(df_full['general_cat']) X_cat_2 = cnt_vec.fit_transform(df_full['subcat_1']) X_cat_3 = cnt_vec.fit_transform(df_full['subcat_2']) df_full.drop(['general_cat', 'subcat_1', 'subcat_2'], axis=1, inplace=True) del cnt_vec gc.collect() logger.info('Vectorize category (general_cat, subcat_1, subcat_2)') wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, { "hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0], "hash_size": 2**29, "norm": "l2", "tf": 1.0, "idf": None }), procs=2) wb.dictionary_freeze = True X_description = wb.fit_transform(df_full['item_description']) del wb gc.collect() X_description = X_description[:, np.array(np.clip( X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)] logger.info('Vectorize item_description') gc.collect() lb = LabelBinarizer(sparse_output=True) X_brand = lb.fit_transform(df_full['brand_name']) df_full.drop(['brand_name'], axis=1, inplace=True) del lb gc.collect() logger.info('Label binarize brand_name') X_dummies = csr_matrix( pd.get_dummies(df_full[['item_condition_id', 'shipping']], sparse=True).values) df_full.drop(['item_condition_id', 'shipping'], axis=1, inplace=True) logger.info('Get dummies on item_condition_id and shipping') gc.collect() sparse_merge = hstack((X_dummies, X_description, X_brand, X_cat_1, X_cat_2, X_cat_3, X_name)).tocsr() logger.info('Create sparse features') logger.info('sparse_merge shape {}'.format(sparse_merge.shape)) del X_dummies, X_description, X_brand, X_cat_1, X_cat_2, X_cat_3, X_name gc.collect() mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool) sparse_merge = sparse_merge[:, mask] logger.info('Remove features with doc frequency <= 1') logger.info('sparse_merge shape {}'.format(sparse_merge.shape)) X = sparse_merge[:nrow_train] X_test = sparse_merge[nrow_train:] sparse_merge_shape = sparse_merge.shape del sparse_merge gc.collect() model = FTRL(alpha=0.01, beta=0.1, 
L1=0.00001, L2=1.0, D=sparse_merge_shape[1], iters=30, inv_link="identity", threads=1) model.fit(X, y) logger.info('Fit FTRL') preds_FTRL = model.predict(X_test) logger.info('Predict FTRL') model = FM_FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=0.1, D=sparse_merge_shape[1], alpha_fm=0.01, L2_fm=0.0, init_fm=0.01, D_fm=200, e_noise=0.0001, iters=20, inv_link="identity", threads=4) model.fit(X, y) logger.info('Fit FM_FTRL') preds_FM_FTRL = model.predict(X_test) logger.info('Predict FM_FTRL') preds = (np.expm1(preds_FTRL) * 0.15 + np.expm1(preds_FM_FTRL) * 0.85) logger.info('Final predictions generated') return preds
def main(): start_time = time.time() from time import gmtime, strftime print(strftime("%Y-%m-%d %H:%M:%S", gmtime())) # if 1 == 1: train = pd.read_table( '../input/mercari-price-suggestion-challenge/train.tsv', engine='c') test = pd.read_table( '../input/mercari-price-suggestion-challenge/test.tsv', engine='c') #train = pd.read_table('../input/train.tsv', engine='c') #test = pd.read_table('../input/test.tsv', engine='c') print('[{}] Finished to load data'.format(time.time() - start_time)) print('Train shape: ', train.shape) print('Test shape: ', test.shape) nrow_test = train.shape[0] # -dftt.shape[0] dftt = train[(train.price < 1.0)] train = train.drop(train[(train.price < 1.0)].index) del dftt['price'] nrow_train = train.shape[0] # print(nrow_train, nrow_test) y = np.log1p(train["price"]) merge: pd.DataFrame = pd.concat([train, dftt, test]) submission: pd.DataFrame = test[['test_id']] del train del test gc.collect() merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \ zip(*merge['category_name'].apply(lambda x: split_cat(x))) merge.drop('category_name', axis=1, inplace=True) print('[{}] Split categories completed.'.format(time.time() - start_time)) handle_missing_inplace(merge) print('[{}] Handle missing completed.'.format(time.time() - start_time)) cutting(merge) print('[{}] Cut completed.'.format(time.time() - start_time)) to_categorical(merge) print('[{}] Convert categorical completed'.format(time.time() - start_time)) wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, { "hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0], "hash_size": 2**29, "norm": None, "tf": 'binary', "idf": None, }), procs=8) wb.dictionary_freeze = True X_name = wb.fit_transform(merge['name']) del (wb) X_name = X_name[:, np. array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)] print('[{}] Vectorize `name` completed.'.format(time.time() - start_time)) wb = CountVectorizer() X_category1 = wb.fit_transform(merge['general_cat']) X_category2 = wb.fit_transform(merge['subcat_1']) X_category3 = wb.fit_transform(merge['subcat_2']) print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time)) # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5], wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, { "hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0], "hash_size": 2**28, "norm": "l2", "tf": 1.0, "idf": None }), procs=8) wb.dictionary_freeze = True X_description = wb.fit_transform(merge['item_description']) del (wb) X_description = X_description[:, np.array(np.clip( X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)] print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time)) lb = LabelBinarizer(sparse_output=True) X_brand = lb.fit_transform(merge['brand_name']) print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time)) X_dummies = csr_matrix( pd.get_dummies(merge[['item_condition_id', 'shipping']], sparse=True).values) print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'. 
format(time.time() - start_time)) print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape, X_name.shape) sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name)).tocsr() print('[{}] Create sparse merge completed'.format(time.time() - start_time)) del X_dummies, merge, X_description, lb, X_brand, X_category1, X_category2, X_category3, X_name gc.collect() # pd.to_pickle((sparse_merge, y), "xy.pkl") # else: # nrow_train, nrow_test= 1481661, 1482535 # sparse_merge, y = pd.read_pickle("xy.pkl") # Remove features with document frequency <=1 print(sparse_merge.shape) mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool) sparse_merge = sparse_merge[:, mask] X = sparse_merge[:nrow_train] X_test = sparse_merge[nrow_test:] print(sparse_merge.shape) train_X, train_y = X, y if develop: train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05, random_state=100) model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=sparse_merge.shape[1], iters=30, inv_link="identity", threads=1) del X gc.collect() model.fit(train_X, train_y) print('[{}] Train FTRL completed'.format(time.time() - start_time)) if develop: preds = model.predict(X=valid_X) print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds))) predsF = model.predict(X_test) print('[{}] Predict FTRL completed'.format(time.time() - start_time)) model = FM_FTRL(alpha=0.012, beta=0.01, L1=0.00001, L2=0.1, D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0, init_fm=0.01, D_fm=200, e_noise=0.0001, iters=17, inv_link="identity", threads=4) model.fit(train_X, train_y) del train_X, train_y gc.collect() print('[{}] Train ridge v2 completed'.format(time.time() - start_time)) if develop: preds = model.predict(X=valid_X) print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds))) predsFM = model.predict(X_test) print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time)) del X_test gc.collect() params = { 'learning_rate': 0.65, 'application': 'regression', 'max_depth': 4, 'num_leaves': 42, 'verbosity': -1, 'metric': 'RMSE', 'data_random_seed': 1, 'bagging_fraction': 0.71, 'bagging_freq': 5, 'feature_fraction': 0.67, 'nthread': 4, 'min_data_in_leaf': 120, 'max_bin': 40 } # Remove features with document frequency <=100 print(sparse_merge.shape) mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 100, 0, 1), dtype=bool) sparse_merge = sparse_merge[:, mask] X = sparse_merge[:nrow_train] X_test = sparse_merge[nrow_test:] print(sparse_merge.shape) del sparse_merge gc.collect() train_X, train_y = X, y if develop: train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05, random_state=100) del X, y gc.collect() d_train = lgb.Dataset(train_X, label=train_y) # del train_X, train_y; gc.collect() watchlist = [d_train] if develop: d_valid = lgb.Dataset(valid_X, label=valid_y) del valid_y gc.collect() watchlist = [d_train, d_valid] #model = lgb.train(params, train_set=d_train, num_boost_round=7500, valid_sets=watchlist, \ # early_stopping_rounds=1000, verbose_eval=1000) model = lgb.train(params, train_set=d_train, num_boost_round=3000, valid_sets=watchlist, \ early_stopping_rounds=1000, verbose_eval=1000) del d_train gc.collect() if develop: preds = model.predict(valid_X) del valid_X gc.collect() print("LGB dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds))) predsL = model.predict(X_test) # del X_test; gc.collect() print('[{}] Predict LGB completed.'.format(time.time() - 
start_time)) #--- BEGIN Huber # Details: http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.HuberRegressor.html # class sklearn.linear_model.HuberRegressor(epsilon=1.35, # max_iter=100, alpha=0.0001, warm_start=False, fit_intercept=True, # tol=1e-05)[source] setup_Huber = 2 if (setup_Huber == 1): model = HuberRegressor(fit_intercept=True, alpha=0.01, max_iter=80, epsilon=363) if (setup_Huber == 2): model = HuberRegressor(fit_intercept=True, alpha=0.05, max_iter=200, epsilon=1.2) model.fit(train_X, train_y) print('[{}] Predict Huber completed.'.format(time.time() - start_time)) predsH = model.predict(X=X_test) #--- END Huber # original # preds = (predsF * 0.2 + predsL * 0.3 + predsFM * 0.5) # modified setup (IT NEEDS MORE TUNING TESTS) w = (0.09, 0.11, 0.23, 0.57) preds = predsH * w[0] + predsF * w[1] + predsL * w[2] + predsFM * w[3] submission['price'] = np.expm1(preds) submission.to_csv("sub ftrl_fm_lgb_huber v3.csv", index=False) nm = (time.time() - start_time) / 60 print("Total processing time %s min" % nm)
# sparse_merge, y = pd.read_pickle("xy.pkl")

# Remove features with document frequency <=1
print(sparse_merge.shape)
mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
sparse_merge = sparse_merge[:, mask]
X = sparse_merge[:nrow_train]
X_test = sparse_merge[nrow_test:]
print(sparse_merge.shape)
d_shape = sparse_merge.shape[1]
gc.collect()
train_X, train_y = X, y
if develop:
    train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05,
                                                          random_state=100)
model = FTRL(alpha=0.1680, beta=9.7895, L1=0.0011, L2=8.9635, D=d_shape, iters=11,
             inv_link="identity", threads=4)
###
del lb
del mask
del X_name
del X_category1
del X_category2
del X_category3
del X
del y
del merge
del X_dummies
del X_brand
del dftt
del X_description
class WordvecRegressor(object): def __init__(self, pickle_model="", datadir=None, batcher=None): self.wb = WordBatch( normalize_text, extractor=Hstack([ WordVec(wordvec_file= "../../../data/word2vec/glove.twitter.27B.100d.txt.gz", normalize_text=normalize_text, encoding="utf8"), WordVec( wordvec_file="../../../data/word2vec/glove.6B.50d.txt.gz", normalize_text=normalize_text, encoding="utf8") ])) # from wordbatch.pipelines import FeatureUnion # from wordbatch.transformers import Dictionary, TextNormalizer # from sklearn.pipeline import Pipeline # tn= TextNormalizer(normalize_text=normalize_text) # dct= Dictionary() # vec1= WordVec(wordvec_file="../../../data/word2vec/glove.twitter.27B.100d.txt.gz", # normalize_text=normalize_text, encoding="utf8", dictionary= dct) # vec2= WordVec(wordvec_file="../../../data/word2vec/glove.6B.50d.txt.gz", # normalize_text=normalize_text, encoding="utf8", dictionary= dct) # self.wb = Pipeline(steps= [("tn", tn), ("dct", dct), ("vecs", FeatureUnion([("vec1", vec1), ("vec2", vec2)]))]) self.batcher = batcher self.clf = FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=100 + 50, iters=1, inv_link="identity") if datadir == None: (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb')) else: self.train(datadir, pickle_model) def fit_batch(self, texts, labels, rcount): texts, labels = shuffle(texts, labels) print("Transforming", rcount) #texts= self.wb.fit_transform(texts, tn__batcher=self.batcher, dct__reset= False, dct__batcher= self.batcher) texts = self.wb.fit_transform(texts) print("Training", rcount) self.clf.fit(texts, labels, reset=False) def train(self, datadir, pickle_model=""): texts = [] labels = [] training_data = os.listdir(datadir) rcount = 0 batchsize = 80000 p = None for jsonfile in training_data: with open(datadir + "/" + jsonfile, 'r') as inputfile: for line in inputfile: #if rcount > 1000000: break try: line = json.loads(line.strip()) except: continue for review in line["Reviews"]: rcount += 1 if rcount % 100000 == 0: print(rcount) if rcount % 6 != 0: continue if "Overall" not in review["Ratings"]: continue texts.append(review["Content"]) labels.append( (float(review["Ratings"]["Overall"]) - 3) * 0.5) if len(texts) % batchsize == 0: if p != None: p.join() p = threading.Thread(target=self.fit_batch, args=(texts, labels, rcount)) p.start() texts = [] labels = [] if p != None: p.join() self.fit_batch(texts, labels, rcount) # if pickle_model!="": # with gzip.open(pickle_model, 'wb') as model_file: # backend = self.wb.batcher.backend # backend_handle = self.wb.batcher.backend_handle # self.wb.batcher.backend = "serial" # self.wb.batcher.backend_handle = None # pkl.dump((self.wb, self.clf), model_file, protocol=2) # self.wb.batcher.backend = backend # self.wb.batcher.backend_handle = backend_handle def predict(self, texts): vecs = self.wb.transform(texts) return self.clf.predict(vecs)
def wordbatch_algo(test):
    import time
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    # if 1 == 1:
    #     train = pd.read_table('../input/mercari-price-suggestion-challenge/train.tsv', engine='c')
    #     test = pd.read_table('../input/mercari-price-suggestion-challenge/test.tsv', engine='c')
    train = pd.read_table('../input/train.tsv', engine='c')
    # test = pd.read_table('../input/test.tsv', engine='c')
    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)

    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge: pd.DataFrame = pd.concat([train, dftt, test])
    # submission: pd.DataFrame = test[['test_id']]

    '''
    # Mean of each group
    # https://stackoverflow.com/questions/30244952/python-pandas-create-new-column-with-groupby-sum
    cat_mean = train['price'].groupby(train['category_name']).mean()
    cat_mean = pd.DataFrame({'category_name': cat_mean.index, 'cat_mean': cat_mean.values})
    merge = merge.merge(cat_mean, on=['category_name'], how='left')
    # print(merge.head())
    X_cat_mean = merge['cat_mean'].as_matrix().reshape(-1, 1)
    # X_cat_mean = normalize(np.nan_to_num(X_cat_mean).reshape(-1, 1), norm='max')

    cond_mean = train['price'].groupby(train['item_condition_id']).mean()
    cond_mean = pd.DataFrame({'item_condition_id': cond_mean.index, 'cond_mean': cond_mean.values})
    merge = merge.merge(cond_mean, on=['item_condition_id'], how='left')
    X_cond_mean = merge['cond_mean'].as_matrix().reshape(-1, 1)

    brand_mean = train['price'].groupby(train['brand_name']).mean()
    brand_mean = pd.DataFrame({'brand_name': brand_mean.index, 'brand_mean': brand_mean.values})
    merge = merge.merge(brand_mean, on=['brand_name'], how='left')
    X_brand_mean = merge['brand_mean'].as_matrix().reshape(-1, 1)

    ship_mean = train['price'].groupby(train['shipping']).mean()
    ship_mean = pd.DataFrame({'shipping': ship_mean.index, 'ship_mean': ship_mean.values})
    merge = merge.merge(ship_mean, on=['shipping'], how='left')
    X_ship_mean = merge['ship_mean'].as_matrix().reshape(-1, 1)
    '''

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))

    # Add some new features:
    X_len_desc = merge['item_description'].apply(lambda x: len(x)).as_matrix().reshape(-1, 1)
    X_len_name = merge['name'].apply(lambda x: len(x)).as_matrix().reshape(-1, 1)
    # X_len_description = normalize(np.nan_to_num(X_len_description).reshape(-1, 1), norm='max')
    # X_len_name = normalize(np.nan_to_num(X_len_name).reshape(-1, 1), norm='max')
    print('[{}] Length `item_description` completed.'.format(time.time() - start_time))
    print('[{}] Length `name` completed.'.format(time.time() - start_time))

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {"hash_ngrams": 2,
                                                  "hash_ngrams_weights": [1.5, 1.0],
                                                  "hash_size": 2 ** 29,
                                                  "norm": None,
                                                  "tf": 'binary',
                                                  "idf": None}),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del wb
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

    # wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {"hash_ngrams": 2,
                                                  "hash_ngrams_weights": [1.0, 1.0],
                                                  "hash_size": 2 ** 28,
                                                  "norm": "l2",
                                                  "tf": 1.0,
                                                  "idf": None}),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del wb
    X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

    X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']], sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(time.time() - start_time))

    print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape,
          X_category2.shape, X_category3.shape, X_name.shape)
    # , X_glove.shape, X_len_description.shape, X_len_name.shape, X_cat_mean.shape)

    # sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name)).tocsr()
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))

    del X_dummies, merge, X_description, lb, X_brand, X_category1, X_category2, X_category3, X_name
    gc.collect()

    # pd.to_pickle((sparse_merge, y), "xy.pkl")
    # else:
    #     nrow_train, nrow_test = 1481661, 1482535
    #     sparse_merge, y = pd.read_pickle("xy.pkl")

    # Remove features with document frequency <= 1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=TEST_SIZE, random_state=SPLIT_SEED)

    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=sparse_merge.shape[1],
                    alpha_fm=0.01, L2_fm=0.0, init_fm=0.01, D_fm=200, e_noise=0.0001,
                    iters=FM_iter, inv_link="identity", threads=4)
    model.fit(train_X, train_y)
    gc.collect()
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
    gc.collect()
    print(predsFM)

    # model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=sparse_merge.shape[1], iters=50, inv_link="identity", threads=1)
    model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=sparse_merge.shape[1],
                 iters=FTRL_iter, inv_link="identity", threads=1)
    del X
    gc.collect()
    model.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsF = model.predict(X_test)
    print('[{}] Predict FTRL completed'.format(time.time() - start_time))
    print(predsF)

    del train_X, train_y
    del X_test
    return predsFM, predsF
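# The function above assumes several helpers defined elsewhere in the script
# (split_cat, handle_missing_inplace, cutting, to_categorical, rmsle). For reference,
# minimal sketches of split_cat and rmsle that are consistent with how they are called
# here might look as follows -- hypothetical reconstructions, not the original definitions:
def split_cat(text):
    # Split "Men/Tops/T-shirts" into exactly three levels, padding missing ones.
    try:
        parts = text.split("/")
    except AttributeError:  # NaN / non-string category
        parts = []
    parts = (parts + ["No Label"] * 3)[:3]
    return parts[0], parts[1], parts[2]


def rmsle(y_true, y_pred):
    # Root mean squared logarithmic error, computed on raw (un-logged) prices.
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))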
    del wb
    X_description = X_description[:, np.where(X_description.getnnz(axis=0) > 1)[0]]

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['Source'])
    X_dummies = csr_matrix(pd.get_dummies(merge[['IDLink', 'Facebook']], sparse=True).values)

    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_name)).tocsr()
    sparse_merge = sparse_merge[:, np.where(sparse_merge.getnnz(axis=0) > 100)[0]]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)
    gc.collect()

    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=0.05, random_state=100)

    model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=sparse_merge.shape[1],
                 iters=50, inv_link="identity", threads=1)
    model.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsF = model.predict(X_test)
    print('[{}] Predict FTRL completed'.format(time.time() - start_time))

    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=sparse_merge.shape[1],
                    alpha_fm=0.01, L2_fm=0.0, init_fm=0.01, D_fm=200, e_noise=0.0001,
                    iters=17, inv_link="identity", threads=4)
    model.fit(train_X, train_y)
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsFM = model.predict(X_test)
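# The column filter above (np.where(X.getnnz(axis=0) > 1)[0]) keeps only features that occur
# in at least two rows; it selects the same columns as the boolean-mask idiom used in the
# other functions. A small self-contained check on toy data (not from the script):
def _df_filter_equivalence_demo():
    from scipy.sparse import csr_matrix
    X = csr_matrix(np.array([[1, 0, 0],
                             [2, 3, 0]]))
    cols = np.where(X.getnnz(axis=0) > 1)[0]                          # array([0])
    mask = np.array(np.clip(X.getnnz(axis=0) - 1, 0, 1), dtype=bool)  # [True, False, False]
    assert (X[:, cols] != X[:, mask]).nnz == 0  # identical column selections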
                  use_avx=1, verbose=0)
    # iters changed to 2 from 116_7
    # threads=4, use_avx=1, verbose=0)
elif wordbatch_model == 'NN_ReLU_H1':
    clf = NN_ReLU_H1(alpha=0.05, D=D, verbose=9, e_noise=0.0, threads=4, inv_link="sigmoid")
elif wordbatch_model == 'FTRL':
    clf = FTRL(alpha=0.05, beta=0.1, L1=0.0, L2=0.0, D=D, iters=3, threads=4, verbose=9)

dtypes = {
    'ip': 'uint32',
    'app': 'uint16',
    'device': 'uint16',
    'os': 'uint16',
    'channel': 'uint16',
    'is_attributed': 'uint8',
}

p = None
rcount = 0
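# The dtypes dict above is presumably used to keep the click-log columns compact in memory
# when reading the raw CSV. A hypothetical illustration only -- the file name and column list
# are placeholders, not taken from this script:
def _load_clicks_demo(path='train_sample.csv'):
    cols = ['ip', 'app', 'device', 'os', 'channel', 'click_time', 'is_attributed']
    return pd.read_csv(path, usecols=cols, dtype=dtypes, parse_dates=['click_time'])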
class WordbagRegressor(object):
    def __init__(self, pickle_model="", datadir=None):
        from pyspark import SparkContext
        self.sc = SparkContext()
        self.wordbatch = wordbatch.WordBatch(normalize_text,
                                             extractor=(WordBag, {"hash_ngrams": 3,
                                                                  "hash_ngrams_weights": [-1.0, -1.0, 1.0],
                                                                  "hash_size": 2 ** 23,
                                                                  "norm": 'l2',
                                                                  "tf": 'binary',
                                                                  "idf": 50.0}))
        self.clf = FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=1, inv_link="identity")
        self.wordbatch.use_sc = True
        if datadir is None:
            (self.wordbatch, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
        else:
            self.train(datadir, pickle_model)

    def fit_batch(self, texts, labels, rcount):
        texts, labels = self.wordbatch.shuffle_batch(texts, labels, rcount)
        print("Transforming", rcount)
        if self.sc is not None:
            data_rdd = self.wordbatch.lists2rddbatches([texts, labels], self.sc)
            data_rdd = self.wordbatch.transform(data_rdd)
            [texts, labels] = self.wordbatch.rddbatches2lists(data_rdd)
        else:
            texts = self.wordbatch.transform(texts)
        print("Training", rcount)
        self.clf.fit(texts, labels)

    def train(self, datadir, pickle_model=""):
        texts = []
        labels = []
        training_data = os.listdir(datadir)
        rcount = 0
        batchsize = 20000
        p = None
        for jsonfile in training_data:
            with open(datadir + "/" + jsonfile, 'r') as inputfile:
                for line in inputfile:
                    # if rcount > 1000000: break
                    try:
                        line = json.loads(line.strip())
                    except:
                        continue
                    for review in line["Reviews"]:
                        rcount += 1
                        if rcount % 100000 == 0:
                            print(rcount)
                        if rcount % 7 != 0:
                            continue
                        if "Overall" not in review["Ratings"]:
                            continue
                        texts.append(review["Content"])
                        labels.append((float(review["Ratings"]["Overall"]) - 3) * 0.5)
                        if len(texts) % batchsize == 0:
                            if p is not None:
                                p.join()
                            p = threading.Thread(target=self.fit_batch, args=(texts, labels, rcount))
                            p.start()
                            texts = []
                            labels = []
        if p is not None:
            p.join()
        self.fit_batch(texts, labels, rcount)
        self.wordbatch.dictionary_freeze = True
        if pickle_model != "":
            with gzip.open(pickle_model, 'wb') as model_file:
                pkl.dump((self.wordbatch, self.clf), model_file, protocol=2)

    def predict(self, texts):
        if self.sc is not None:
            data_rdd = self.wordbatch.lists2rddbatches([texts, []], self.sc)
            data_rdd = self.wordbatch.transform(data_rdd)
            [counts, labels] = self.wordbatch.rddbatches2lists(data_rdd)
        else:
            counts = self.wordbatch.transform(texts)
        return self.clf.predict(counts)

    def predict_parallel(self, texts):
        if self.sc is not None:
            data_rdd = self.wordbatch.lists2rddbatches([texts, []], self.sc)
            counts_rdd = self.wordbatch.transform(data_rdd)
            return self.wordbatch.rddbatches2lists(self.wordbatch.predict_parallel(counts_rdd, self.clf))[0]
        counts = self.wordbatch.transform(texts)
        return self.wordbatch.predict_parallel(counts, self.clf)
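# Hypothetical usage of the Spark-backed WordbagRegressor above; the file paths and review
# text are placeholders, not taken from the script. Passing a datadir trains and pickles the
# model, while datadir=None reloads the pickle.
def _wordbag_regressor_demo():
    WordbagRegressor(pickle_model="wordbag_model.pkl.gz", datadir="data/json_reviews")
    model = WordbagRegressor(pickle_model="wordbag_model.pkl.gz")  # datadir=None -> load pickled model
    return model.predict(["Great hotel, friendly staff and a clean room."])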
def wordbatch_algo():
    import time
    # print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    train = pd.read_table('../input/train.tsv', engine='c')

    # Drop rows where price = 0
    train = train[train.price != 0].reset_index(drop=True)
    print('[{}] Finished loading data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)

    y = np.log1p(train["price"])
    nrow_train = train.shape[0]

    # Training
    train['general_cat'], train['subcat_1'], train['subcat_2'] = \
        zip(*train['category_name'].apply(lambda x: split_cat(x)))
    train.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(train)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(train)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(train)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))

    # Add some new features:
    X_len_desc = train['item_description'].apply(lambda x: len(x)).as_matrix().reshape(-1, 1)
    X_len_name = train['name'].apply(lambda x: len(x)).as_matrix().reshape(-1, 1)
    print('[{}] Length of text completed.'.format(time.time() - start_time))

    # Name
    wb_name = wordbatch.WordBatch(normalize_text,
                                  extractor=(WordBag, {"hash_ngrams": 2,
                                                       "hash_ngrams_weights": [1.5, 1.0],
                                                       "hash_size": 2 ** 29,
                                                       "norm": None,
                                                       "tf": 'binary',
                                                       "idf": None}),
                                  procs=8)
    wb_name.dictionary_freeze = True
    wb_name.fit(train['name'])
    X_name = wb_name.transform(train['name'])
    # X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb_cat1 = CountVectorizer()
    wb_cat2 = CountVectorizer()
    wb_cat3 = CountVectorizer()
    wb_cat1.fit(train['general_cat'])
    wb_cat2.fit(train['subcat_1'])
    wb_cat3.fit(train['subcat_2'])
    X_category1 = wb_cat1.transform(train['general_cat'])
    X_category2 = wb_cat2.transform(train['subcat_1'])
    X_category3 = wb_cat3.transform(train['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

    # wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb_desc = wordbatch.WordBatch(normalize_text,
                                  extractor=(WordBag, {"hash_ngrams": 2,
                                                       "hash_ngrams_weights": [1.0, 1.0],
                                                       "hash_size": 2 ** 28,
                                                       "norm": "l2",
                                                       "tf": 1.0,
                                                       "idf": None}),
                                  procs=8)
    wb_desc.dictionary_freeze = True
    wb_desc.fit(train['item_description'])
    X_description = wb_desc.transform(train['item_description'])
    # X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

    lb = LabelBinarizer(sparse_output=True)
    lb.fit(train['brand_name'])
    X_brand = lb.transform(train['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

    X_cond, d_cond = fit_dummy(train['item_condition_id'].tolist())
    X_ship, d_ship = fit_dummy(train['shipping'].tolist())
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(time.time() - start_time))

    del train
    gc.collect()

    print(X_cond.shape, X_ship.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape, X_name.shape)

    sparse_merge = hstack((X_cond, X_ship, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))

    del X_description, X_brand, X_category1, X_category2, X_category3, X_name
    gc.collect()

    # Remove features with document frequency <= 1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    print(sparse_merge.shape)

    X = sparse_merge

    # ---------------------------------------
    # FM model fit
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=TEST_SIZE, random_state=SPLIT_SEED)

    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=train_X.shape[1],
                    alpha_fm=0.01, L2_fm=0.0, init_fm=0.01, D_fm=200, e_noise=0.0001,
                    iters=FM_iter, inv_link="identity", threads=4)
    model.fit(train_X, train_y)
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    print('-' * 20)
    if develop:
        preds = model.predict(X=valid_X)
        print("->>>> FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    # ---------------------------------------
    # FTRL model fit
    model2 = FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=1.0, D=train_X.shape[1],
                  iters=FTRL_iter, inv_link="identity", threads=1)
    # del X; gc.collect()
    model2.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model2.predict(X=valid_X)
        print("->>>> FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    # Clear variables:
    del X, train_X, train_y, sparse_merge
    gc.collect()

    # ---------------------------------------
    # Testing by chunk
    print(' FM/FTRL: ...reading the test data...')
    predsFM = []
    predsF = []
    for test in load_test():
        test['general_cat'], test['subcat_1'], test['subcat_2'] = \
            zip(*test['category_name'].apply(lambda x: split_cat(x)))
        test.drop('category_name', axis=1, inplace=True)

        handle_missing_inplace(test)
        # print('[{}] Handle missing completed.'.format(time.time() - start_time))
        cutting(test)
        # print('[{}] Cut completed.'.format(time.time() - start_time))
        to_categorical(test)
        # print('[{}] Convert categorical completed'.format(time.time() - start_time))

        # Add some new features:
        X_len_desc_test = test['item_description'].apply(lambda x: len(x)).as_matrix().reshape(-1, 1)
        X_len_name_test = test['name'].apply(lambda x: len(x)).as_matrix().reshape(-1, 1)

        X_name_test = wb_name.transform(test['name'])
        # X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
        X_category1_test = wb_cat1.transform(test['general_cat'])
        X_category2_test = wb_cat2.transform(test['subcat_1'])
        X_category3_test = wb_cat3.transform(test['subcat_2'])
        X_description_test = wb_desc.transform(test['item_description'])
        # X_description_test = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
        X_brand_test = lb.transform(test['brand_name'])
        X_cond_test = transform_dummy(test['item_condition_id'].tolist(), d_cond)
        X_ship_test = transform_dummy(test['shipping'].tolist(), d_ship)

        X_test = hstack((X_cond_test, X_ship_test, X_description_test, X_brand_test,
                         X_category1_test, X_category2_test, X_category3_test, X_name_test)).tocsr()
        X_test = X_test[:, mask]

        # Clear variables:
        del X_cond_test, X_ship_test, X_description_test, X_brand_test, \
            X_category1_test, X_category2_test, X_category3_test, X_name_test
        del test
        gc.collect()

        predsFM_batch = model.predict(X_test)
        predsFM += np.array(predsFM_batch).flatten().tolist()
        predsF_batch = model2.predict(X_test)
        predsF += np.array(predsF_batch).flatten().tolist()

    print(np.array(predsFM))
    print('-' * 20)
    print(np.array(predsF))
    print('-' * 20)

    return np.array(predsFM), np.array(predsF)
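# wordbatch_algo() above relies on fit_dummy / transform_dummy and load_test, which are defined
# elsewhere in the script. The sketches below are hypothetical reconstructions that match the
# call sites (one-hot encoding with a reusable mapping, chunked reading of the test file); the
# chunk size is an assumption.
def fit_dummy(values):
    # Build a value -> column index mapping from the training column and one-hot encode it.
    mapping = {v: i for i, v in enumerate(sorted(set(values)))}
    return transform_dummy(values, mapping), mapping


def transform_dummy(values, mapping):
    # One-hot encode using the training-time mapping; unseen values become all-zero rows.
    rows, cols = [], []
    for i, v in enumerate(values):
        j = mapping.get(v)
        if j is not None:
            rows.append(i)
            cols.append(j)
    data = np.ones(len(rows), dtype=np.float32)
    return csr_matrix((data, (rows, cols)), shape=(len(values), len(mapping)))


def load_test(chunksize=700000):
    # Stream the test set in chunks so predictions can be made without holding it all in memory.
    for chunk in pd.read_table('../input/test.tsv', engine='c', chunksize=chunksize):
        yield chunk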
def main():
    train = pd.read_table('../input/train.tsv', engine='c')
    test = pd.read_table('../input/test.tsv', engine='c')
    print('Finished loading data')

    nrow_test = train.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    y = np.log1p(train["price"])
    merge: pd.DataFrame = pd.concat([train, dftt, test])
    submission: pd.DataFrame = test[['test_id']]
    del train, test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('Split categories completed.')

    handle_missing_inplace(merge)
    print('Handle missing completed.')

    cutting(merge)
    print('Cut completed.')

    to_categorical(merge)
    print('Convert categorical completed')

    cv = CountVectorizer(min_df=NAME_MIN_DF)
    X_name_cv = cv.fit_transform(merge['name'])
    cv = CountVectorizer()
    X_category1_cv = cv.fit_transform(merge['general_cat'])
    X_category2_cv = cv.fit_transform(merge['subcat_1'])
    X_category3_cv = cv.fit_transform(merge['subcat_2'])

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {'hash_ngrams': 2,
                                                  'hash_ngrams_weights': [1.5, 1.0],
                                                  'hash_size': 2 ** 29,
                                                  'norm': None,
                                                  'tf': 'binary',
                                                  'idf': None}),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del wb
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('Vectorize `name` completed.')

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('Count vectorize `categories` completed.')

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {'hash_ngrams': 2,
                                                  'hash_ngrams_weights': [1.0, 1.0],
                                                  'hash_size': 2 ** 28,
                                                  'norm': 'l2',
                                                  'tf': 1.0,
                                                  'idf': None}),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del wb
    X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('Vectorize `item_description` completed.')

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('Label binarize `brand_name` completed.')

    X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']], sparse=True).values)
    print('Get dummies on `item_condition_id` and `shipping` completed.')

    num_chars = merge['item_description'].apply(lambda x: len(x)).values
    num_words = merge['item_description'].apply(lambda x: len(x.split(' '))).values
    num_upper = merge['item_description'].apply(lambda x: len(re.findall('[A-Z]+', x))).values
    num_chars = num_chars / max(num_chars)
    num_words = num_words / max(num_words)
    num_upper = num_upper / max(num_upper)
    X_feature = np.vstack([num_chars, num_words, num_upper]).T
    print('musicmilif features completed.')

    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2,
                           X_category3, X_name, X_category1_cv, X_category2_cv,
                           X_category3_cv, X_name_cv, X_feature)).tocsr()
    print('Create sparse merge completed')

    del X_dummies, X_description, X_brand, X_category1, X_category2, X_category3
    del X_name, X_category1_cv, X_category2_cv, X_category3_cv, X_name_cv, X_feature
    del num_chars, num_words, num_upper
    gc.collect()

    # Remove features with document frequency <= 1
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    gc.collect()

    train_X, train_y = X, y

    model = Ridge(solver='auto', fit_intercept=True, alpha=5.0, max_iter=100, normalize=False, tol=0.05)
    model.fit(train_X, train_y)
    print('Train Ridge completed')
    predsR = model.predict(X_test)
    print('Predict Ridge completed')

    model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=sparse_merge.shape[1],
                 iters=50, inv_link="identity", threads=1)
    model.fit(train_X, train_y)
    print('Train FTRL completed')
    predsF = model.predict(X_test)
    print('Predict FTRL completed')

    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=sparse_merge.shape[1],
                    alpha_fm=0.01, L2_fm=0.0, init_fm=0.01, D_fm=200, e_noise=0.0001,
                    iters=17, inv_link="identity", threads=4)
    model.fit(train_X, train_y)
    print('Train FM_FTRL completed')
    predsFM = model.predict(X_test)
    print('Predict FM_FTRL completed')

    params = {
        'learning_rate': 0.6,
        'application': 'regression',
        'max_depth': 9,
        'num_leaves': 24,
        'verbosity': -1,
        'metric': 'RMSE',
        'data_random_seed': 1,
        'bagging_fraction': 0.9,
        'bagging_freq': 6,
        'feature_fraction': 0.8,
        'nthread': 4,
        'min_data_in_leaf': 51,
        'max_bin': 64
    }

    # Remove features with document frequency <= 200
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 200, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    train_X, train_y = X, y

    d_train = lgb.Dataset(train_X, label=train_y)
    watchlist = [d_train]
    model = lgb.train(params, train_set=d_train, num_boost_round=1800, valid_sets=watchlist,
                      early_stopping_rounds=500, verbose_eval=400)
    predsL = model.predict(X_test)
    print('Predict LGBM completed')

    preds = (predsR * 1 + predsF * 1 + predsFM * 16 + predsL * 6) / (1 + 1 + 16 + 6)
    submission['price'] = np.expm1(preds)
    submission.to_csv("submission.csv", index=False)
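# The final blend in main() is a fixed weighted average of the four models. Below is an
# equivalent formulation with np.average, plus a conventional entry-point guard; both are
# assumptions for illustration, since the original driver code is not shown here.
def blend_predictions(predsR, predsF, predsFM, predsL, weights=(1, 1, 16, 6)):
    # Same result as (predsR * 1 + predsF * 1 + predsFM * 16 + predsL * 6) / 24.
    return np.average(np.vstack([predsR, predsF, predsFM, predsL]), axis=0, weights=weights)


if __name__ == '__main__':
    main()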