def __init__(self, pickle_model="", datadir=None): from pyspark import SparkContext self.sc= SparkContext() self.wordbatch = wordbatch.WordBatch(normalize_text, backend="spark", backend_handle=self.sc, extractor=(WordBag, {"hash_ngrams":3, "hash_ngrams_weights":[-1.0, -1.0, 1.0], "hash_size":2**23, "norm":'l2', "tf":'binary', "idf":50.0})) self.clf= FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 23, iters=1, inv_link="identity") if datadir==None: (self.wordbatch, self.clf)= pkl.load(gzip.open(pickle_model, 'rb')) else: self.train(datadir, pickle_model)
def __init__(self, pickle_model="", datadir=None): self.maxlen = 100 self.n_words = 100000 parser = NeonArgparser(__doc__) self.args = parser.parse_args() self.args.batch_size = self.batch_size = 2048 # self.args.deterministic = None self.args.rng_seed = 0 print extract_valid_args(self.args, gen_backend) self.be = gen_backend(**extract_valid_args(self.args, gen_backend)) embedding_dim = 100 init_emb = Uniform(-0.1 / embedding_dim, 0.1 / embedding_dim) init_glorot = GlorotUniform() self.layers = [ LookupTable(vocab_size=self.n_words, embedding_dim=embedding_dim, init=init_emb, pad_idx=0, update=True, name="LookupTable"), Dropout(keep=0.5), BiLSTM(100, init=init_glorot, activation=Tanh(), gate_activation=Logistic(), reset_cells=True, split_inputs=False, name="BiLSTM"), RecurrentMean(), Affine(1, init_glorot, bias=init_glorot, activation=Identity(), name="Affine") ] self.wordbatch = wordbatch.WordBatch(normalize_text, n_words=self.n_words, extractors=[(wordbatch.WordSeq, { "seq_maxlen": self.maxlen })]) if datadir == None: self.model = Model(self.layers) self.model.load_params(pickle_model) self.wordbatch = pkl.load(gzip.open(pickle_model + ".wb", 'rb')) else: self.train(datadir, pickle_model)
def __init__(self, pickle_model="", datadir=None): self.wb = wordbatch.WordBatch(normalize_text, extractor=(WordHash, { "decode_error": 'ignore', "n_features": 2**25, "non_negative": False, "ngram_range": (1, 2), "norm": 'l2' })) self.clf = Ridge(alpha=1.0, random_state=0) if datadir == None: (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb')) else: self.train(datadir, pickle_model)
def __init__(self, pickle_model="", datadir=None): self.wb = wordbatch.WordBatch(normalize_text, stemmer=stemmer, extractor=(WordHash, { "decode_error": 'ignore', "n_features": 2**25, "non_negative": False, "ngram_range": (1, 2), "norm": 'l2' })) self.clf = FM_FTRL(D=2**25, D_fm=4, iters=1, inv_link="identity", threads=multiprocessing.cpu_count() // 2) if datadir == None: (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb')) else: self.train(datadir, pickle_model)
def __init__(self, pickle_model="", datadir=None): seed = 10002 session_conf = tf.ConfigProto( intra_op_parallelism_threads=multiprocessing.cpu_count() // 2, inter_op_parallelism_threads=1) os.environ['PYTHONHASHSEED'] = str(seed) np.random.seed(seed + 1) random.seed(seed + 2) tf.set_random_seed(seed + 3) K.set_session( tf.Session(graph=tf.get_default_graph(), config=session_conf)) self.maxlen = 200 self.max_words = 20000 self.wb = wordbatch.WordBatch(normalize_text, max_words=self.max_words, extractor=(WordSeq, { "seq_maxlen": self.maxlen })) self.model = Sequential() self.model.add( Embedding(self.max_words + 2, 20, input_length=self.maxlen)) self.model.add( Conv1D(activation="relu", padding="same", strides=1, filters=10, kernel_size=3)) self.model.add(Dropout(0.5)) self.model.add(BatchNormalization()) self.model.add(GlobalMaxPooling1D()) self.model.add(Dense(1)) self.model.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error']) if datadir == None: self.model = load_model(pickle_model) self.wb = pkl.load(gzip.open(pickle_model + ".wb", 'rb')) else: self.train(datadir, pickle_model)
def normalize_text(text):
    text = text.lower()
    text = nums_re.sub(" NUM ", text)
    text = " ".join([word for word in non_alphanums.sub(" ", text).strip().split()
                     if len(word) > 1])
    return text

maxlen = 200
max_words = 20000
wb = wordbatch.WordBatch(normalize_text, max_words=max_words,
                         extractor=(WordSeq, {"seq_maxlen": maxlen}))

wb = wordbatch.WordBatch(
    normalize_text,
    extractor=(Hstack,
               [(WordVec, {"wordvec_file": "../../../data/word2vec/glove.twitter.27B.100d.txt.gz",
                           "normalize_text": normalize_text,
                           "encoding": "utf8"}),
                (WordVec, {"wordvec_file": "../../../data/word2vec/glove.6B.50d.txt.gz",
                           "normalize_text": normalize_text,
                           "encoding": "utf8"})]))
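# A minimal usage sketch, not from the source: the sample texts below are made
# up, and the 150-column width assumes both GloVe files above (100d + 50d)
# load successfully through the Hstack of the two WordVec extractors.
sample_texts = ["this movie was great", "terrible film and a waste of time"]
X_vec = wb.fit_transform(sample_texts)
print(X_vec.shape)  # expected: (2, 150) under the assumptions above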
class WBFmFtrlModel(object):
    wb = wordbatch.WordBatch(None,
                             extractor=(WordHash, {"ngram_range": (1, 1), "analyzer": "word",
                                                   "lowercase": False, "n_features": D,
                                                   "norm": None, "binary": True}),
                             minibatch_size=batchsize // 80, procs=8, freeze=True,
                             timeout=1800, verbose=0)
    clf = FM_FTRL(alpha=0.05, beta=0.1, L1=0.0, L2=0.0, D=D, alpha_fm=0.02, L2_fm=0.0,
                  init_fm=0.01, weight_fm=1.0, D_fm=8, e_noise=0.0, iters=3,
                  inv_link="sigmoid", e_clip=1.0, threads=4, use_avx=1, verbose=0)

    def __init__(self, train_files):
        self.train_files = train_files

    def predict(self, predict_file):
        p = None
        test_preds = []
        click_ids = []
        X = None
        for df_c in pd.read_csv(predict_file, engine='c', chunksize=batchsize, sep=","):
            str_array, labels, weights = df2csr(self.wb, df_c)
            click_ids += df_c['click_id'].tolist()
            del (df_c)
            if p is not None:
                test_preds += list(p.join())
            if X is not None:
                del (X)
                X = None
                gc.collect()
            X = self.wb.transform(str_array)
            del (str_array)
            p = ThreadWithReturnValue(target=predict_batch, args=(self.clf, X))
            p.start()
        if p is not None:
            test_preds += list(p.join())
        del (X)
        return click_ids, test_preds

    def train(self):
        p = None
        X = None
        rcount = 0
        for train_file in self.train_files:
            print("Train using file:{}".format(train_file))
            for df_c in pd.read_csv(train_file, engine='c', chunksize=batchsize,
                                    #for df_c in pd.read_csv('../input/train.csv', engine='c', chunksize=batchsize,
                                    sep=",", dtype=dtypes):
                rcount += len(df_c)
                #cpuStats()
                str_array, labels, weights = df2csr(self.wb, df_c,
                                                    pick_hours={4, 5, 10, 13, 14})
                del (df_c)
                if p is not None:
                    p.join()
                if X is not None:
                    del (X)
                    X = None
                gc.collect()
                X = self.wb.transform(str_array)
                del (str_array)
                if rcount % (2 * batchsize) == 0:
                    if p is not None:
                        p.join()
                    p = threading.Thread(target=evaluate_batch,
                                         args=(self.clf, X, labels, rcount))
                    p.start()
                print("Training", rcount, time.time() - start_time)
                cpuStats()
                if p is not None:
                    p.join()
                p = threading.Thread(target=fit_batch, args=(self.clf, X, labels, weights))
                p.start()
        if p is not None:
            p.join()
        del (X)
        X = None
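# The snippets above and below rely on a ThreadWithReturnValue helper that is
# not shown in the source. A plausible minimal sketch (assumed, not the
# original implementation): a Thread whose join() hands back the target's
# return value, which is how the predict_batch results are collected
# asynchronously while the next chunk is being vectorized.
import threading

class ThreadWithReturnValue(threading.Thread):
    def __init__(self, group=None, target=None, name=None, args=(), kwargs=None):
        super().__init__(group=group, target=target, name=name, args=args,
                         kwargs=kwargs or {})
        self._return = None  # holds the target's return value

    def run(self):
        # same as Thread.run(), but capture the result
        if self._target is not None:
            self._return = self._target(*self._args, **self._kwargs)

    def join(self, timeout=None):
        super().join(timeout)
        return self._return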
def main():
    feature_vectorized_file_name = 'Data/feature_vectorized2'
    if os.path.exists(feature_vectorized_file_name):
        sparse_merge, price = _load(feature_vectorized_file_name)
        print(sparse_merge.shape)
    else:
        ########################################################################
        start_time = time.time()
        merge, submission, price = get_extract_feature()
        merge = merge[:TRAIN_SIZE]
        #merge['item_condition_id'] = merge['item_condition_id'].astype('category')
        # print('[{}] Convert categorical completed'.format(time.time() - start_time))
        #
        # # vectorize features
        # wb = CountVectorizer()
        # X_category2 = wb.fit_transform(merge['category_2'])
        # X_category3 = wb.fit_transform(merge['category_name'])
        # X_brand2 = wb.fit_transform(merge['brand_name'])
        # print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))
        #
        # lb = LabelBinarizer(sparse_output=True)
        # X_brand = lb.fit_transform(merge['brand_name'])
        # X_category1 = lb.fit_transform(merge['category_1'])
        # X_category4 = lb.fit_transform(merge['category_name'])
        # print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))
        #
        # X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']],
        #                                       sparse=True).values)
        #
        # # hand feature
        # for col in merge.columns:
        #     if ('Len' in col) or ('Frec' in col):
        #         merge[col] = np.log1p(merge[col])
        #         merge[col] = merge[col] / merge[col].max()
        #
        # hand_feature = ['brand_name_Frec', 'item_description_wordLen',
        #                 'brand_name_name_Intsct', 'brand_name_item_description_Intsct']
        # X_hand_feature = merge[hand_feature].values
        #
        name_w1 = param_space_best_WordBatch['name_w1']
        name_w2 = param_space_best_WordBatch['name_w2']
        desc_w1 = param_space_best_WordBatch['desc_w1']
        desc_w2 = param_space_best_WordBatch['desc_w2']
        #
        # wb = wordbatch.WordBatch(normalize_text=None, extractor=(WordBag, {
        #     "hash_ngrams": 2,
        #     "hash_ngrams_weights": [name_w1, name_w2],
        #     "hash_size": 2 ** 28,
        #     "norm": None,
        #     "tf": 'binary',
        #     "idf": None,
        # }), procs=8)
        # wb.dictionary_freeze = True
        # X_name = wb.fit_transform(merge['name'])
        # del (wb)
        # X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 2, 0, 1), dtype=bool)]
        # print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
        merge['item_description'] = merge['category_2'].map(str) + ' E ' + \
                                    merge['name'].map(str) + ' E ' + \
                                    merge['item_description'].map(str)
        wb = wordbatch.WordBatch(normalize_text=None,
                                 extractor=(WordBag, {"hash_ngrams": 3,
                                                      "hash_ngrams_weights": [desc_w1, desc_w2, 0.7],
                                                      "hash_size": 2 ** 28, "norm": "l2",
                                                      "tf": 1.0, "idf": None}),
                                 procs=8)
        wb.dictionary_freeze = True
        X_description = wb.fit_transform(merge['item_description'])
        del (wb)
        X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 6, 0, 1),
                                                  dtype=bool)]
        print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))
        print(X_description.shape)
        sparse_merge = hstack((X_dummies, X_brand, X_brand2, X_category1, X_category2,
                               X_category3, X_category4, X_hand_feature, X_name,
                               X_description)).tocsr()
        print(X_dummies.shape, X_brand.shape, X_brand2.shape, X_category1.shape,
              X_category2.shape, X_category3.shape, X_category4.shape,
              X_hand_feature.shape, X_name.shape, X_description.shape, sparse_merge.shape)
        _save(feature_vectorized_file_name, [sparse_merge, price])
        print('[{}] data saved.'.format(time.time() - start_time))
    ########################################################################
    # use hyperopt to find the best parameters of the model
    # use 3 fold cross validation
learner_name = 'best_FTRL'
# learner_name = 'FTRL'
learner_name = 'best_FM_FTRL'
# learner_name = 'FM_FTRL'
print(learner_name)
logname = "[Learner@%s]_hyperopt_%s.log" % (learner_name, time_utils._timestamp())
logger = logging_utils._get_logger('Log', logname)
logger.info('start')
optimizer = TaskOptimizer(learner_name, sparse_merge, price, logger)
optimizer.run()
a = 12
def getFMFTRL():
    #os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')
    glove_file = '../feat/glove.6B.50d.txt'
    threads = 4
    save_dir = '../feat'
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]
    '''
    ix = (merge['brand_name']==merge['brand_name']) & \
         (~merge['brand_name'].str.lower().fillna('ZZZZZZ').isin(merge['name'].str.lower()))
    merge['name'][ix] = merge['brand_name'][ix] + ' ' + merge['name'][ix]
    '''
    # EXTRACT DEVELOPMENT TEST
    trnidx, validx = train_test_split(range(train.shape[0]), random_state=233,
                                      train_size=0.90)
    del train
    del test
    gc.collect()
    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    #merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))
    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))
    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))
    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))

    '''
    Encode Original Strings
    '''
    '''
    for col in ['item_description', 'name']:
        wb = CountVectorizer()
        if 'X_orig' not in locals():
            X_orig = wb.fit_transform(merge[col])
        else:
            X_orig = hstack((X_orig, wb.fit_transform(merge[col])))
    print ('Shape of original hash', X_orig.shape)
    X_orig = X_orig.tocsr()
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 3, 0, 1), dtype=bool)]
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 100, 1, 0), dtype=bool)]
    print ('Shape of original hash', X_orig.shape)
    X_orig = X_orig.tocoo()
    '''

    '''
    Stemmer
    '''
    # https://github.com/skbly7/usefulness/blob/ed11cd55080d553cf62873999a5e00b154057fbc/textpreprocess.py
    from nltk.tokenize import WordPunctTokenizer
    # This is better for sentences containing unicode, like: u"N\u00faria Espert"
    word_tokenize = WordPunctTokenizer().tokenize
    import Stemmer

    ps = Stemmer.Stemmer("english")
    _wsre = re.compile(r"\s+")
    _alphanumre = re.compile(r"[\w\-\' ]", re.UNICODE)

    def _removestopwords(txtwords):
        global stoplist
        # stoplist = stopwords.words("english")
        if stoplist is None:
            stoplist = frozenset([l.strip() for l in open(STOPFILE).readlines()])
        return [[w for w in t if w not in stoplist] for t in txtwords]

    def _stem(txtwords):
        return [stemmer.stemWords(t) for t in txtwords]

    def _removenonalphanumericchars(txtwords):
        return [["".join([c for c in w if _alphanumre.search(c) is not None]) for w in t]
                for t in txtwords]

    def _stripallwhitespace(txts):
        return [_wsre.sub("", txt) for txt in txts]

    stemmer = Stemmer.Stemmer("english")

    def textpreprocess(txt, sentencetokenize=False, replacehyphenbyspace=True,
                       wordtokenize=False, lowercase=True, stem=True,
                       removenonalphanumericchars=True, stripallwhitespace=True):
        """
        Note: For html2text, one could also use NCleaner (common.html2text.batch_nclean)
        Note: One could improve the sentence tokenization, by using the original
        HTML formatting in the tokenization.
        Note: We use the Porter stemmer. (Optimization: Shouldn't rebuild the
        PorterStemmer object each time this function is called.)
        """
        if sentencetokenize:
            txts = nltk.sent_tokenize(txt)
            #txts = tokenizer.tokenize(txt.split())
        else:
            txts = txt.split()
        txt = None
        if replacehyphenbyspace:
            txts = [t.replace("-", " ") for t in txts]
        if wordtokenize:
            txtwords = [word_tokenize(t) for t in txts]
        else:
            txtwords = [t.split() for t in txts]
        txts = None
        if lowercase:
            txtwords = [[w.lower() for w in t] for t in txtwords]
        if stem:
            txtwords = _stem(txtwords)
        # TODO: Maybe remove Unicode accents?
        # http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
        if removenonalphanumericchars:
            txtwords = _removenonalphanumericchars(txtwords)
        txtwords = [[w for w in t if w != ""] for t in txtwords]
        txts = [" ".join(words) for words in txtwords]
        if stripallwhitespace:
            for _ in range(2):
                txts = _stripallwhitespace(txts)
        return " ".join(txts)

    print('[{}] Start stemming'.format(time.time() - start_time))
    merge['stem_name'] = [textpreprocess(s) for s in merge["name"].values]
    print('[{}] Stemming completed'.format(time.time() - start_time))

    '''
    Crossed columns
    '''
    # my understanding on how to replicate what layers.crossed_column does. One
    # can read here: https://www.tensorflow.org/tutorials/linear.
    def cross_columns(x_cols):
        """simple helper to build the crossed columns in a pandas dataframe"""
        crossed_columns = dict()
        colnames = ['_'.join(x_c) for x_c in x_cols]
        for cname, x_c in zip(colnames, x_cols):
            crossed_columns[cname] = x_c
        return crossed_columns

    merge['item_condition_id_str'] = merge['item_condition_id'].astype(str)
    merge['shipping_str'] = merge['shipping'].astype(str)
    x_cols = (
        ['brand_name', 'item_condition_id_str'],
        ['brand_name', 'subcat_1'],
        ['brand_name', 'subcat_2'],
        ['brand_name', 'general_cat'],
        #['brand_name', 'subcat_1', 'item_condition_id_str'],
        #['brand_name', 'subcat_2', 'item_condition_id_str'],
        #['brand_name', 'general_cat', 'item_condition_id_str'],
        ['brand_name', 'shipping_str'],
        ['shipping_str', 'item_condition_id_str'],
        ['shipping_str', 'subcat_2'],
        ['item_condition_id_str', 'subcat_2']
    )
    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(merge.select_dtypes(include=['object']).columns)
    D = 2 ** 30
    for k, v in crossed_columns_d.items():
        print('Crossed column ', k)
        outls_ = []
        indicator = 0
        for col in v:
            outls_.append((np.array(merge[col].apply(hash))) % D + indicator)
            indicator += 10 ** 6
        merge[k] = sum(outls_).tolist()

    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    lb = LabelBinarizer(sparse_output=True)
    x_col = lb.fit_transform(merge[cross_nm[0]])
    for i in range(1, len(cross_nm)):
        x_col = hstack((x_col, lb.fit_transform(merge[cross_nm[i]])))
    del (lb)

    '''
    Hash name
    '''
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {"hash_ngrams": 2,
                                                  "hash_ngrams_weights": [1.5, 1.0],
                                                  "hash_size": 2 ** 29, "norm": None,
                                                  "tf": 'binary', "idf": None}),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del (wb)
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    '''
    Hash category
    '''
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {"hash_ngrams": 2,
                                                  "hash_ngrams_weights": [1.0, 1.0],
                                                  "hash_size": 2 ** 20, "norm": None,
                                                  "tf": 'binary', "idf": None}),
                             procs=8)
    wb.dictionary_freeze = True
    cat = merge["category_name"].str.replace('/', ' ')
    X_cat = wb.fit_transform(cat)
    del (wb)
    X_cat = X_cat[:, np.array(np.clip(X_cat.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `category` completed.'.format(time.time() - start_time))

    '''
    Count category
    '''
    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

    # wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3,
    #                          "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {"hash_ngrams": 2,
                                                  "hash_ngrams_weights": [1.0, 1.0],
                                                  "hash_size": 2 ** 28, "norm": "l2",
                                                  "tf": 1.0, "idf": None}),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del (wb)
    X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1),
                                              dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

    X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']],
                                          sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(
        time.time() - start_time))
    print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape,
          X_category2.shape, X_category3.shape, X_name.shape, X_cat.shape, x_col.shape,
          X_stem_name.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2,
                           X_category3, X_name, X_cat, x_col, X_stem_name)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))

    # Remove features with document frequency <= 1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)
    gc.collect()
    if develop:
        #train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233)
        train_X, valid_X, train_y, valid_y = X[trnidx], X[validx], y.values[trnidx], y.values[validx]
    model = FM_FTRL(alpha=0.005, beta=0.005, L1=0.00001, L2=0.1, D=sparse_merge.shape[1],
                    alpha_fm=0.005, L2_fm=0.0, init_fm=0.01, D_fm=200, e_noise=0.0001,
                    iters=1, inv_link="identity", threads=threads)  #iters=15
    baseline = 1.
    for i in range(15):
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline:
            baseline = score_
        else:
            break
    print('[{}] Train ridge v2 completed'.format(time.time() - start_time))
    if develop:
        predsfm = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(predsfm)))
        # 0.44532
    # Full data 0.424681
    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
    return merge, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
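# The rmsle helper used above (and in the later getFMFTRL variants) is not
# defined in these snippets. A standard sketch consistent with how it is
# called here, i.e. on back-transformed prices via np.expm1:
import numpy as np

def rmsle(y_true, y_pred):
    # root mean squared logarithmic error on raw (non-log) target values
    return np.sqrt(np.mean((np.log1p(y_true) - np.log1p(y_pred)) ** 2))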
test_ids = kaggle_test_df['id']
test_size = kaggle_test_df.shape[0]
if os.path.isfile(df_path):
    print('Preprocessed file found! Loading preprocessed Dataset')
    df_full = pd.read_csv(df_path)
else:
    print('No preprocessed file found, start preprocessing')
    df_full = preprocessing(df, kaggle_test_df)

wb = wordbatch.WordBatch(normalize_text,
                         extractor=(WordBag, {"hash_ngrams": 2,
                                              "hash_ngrams_weights": [0.5, -1.0],
                                              "hash_size": 2 ** 23, "norm": 'l2',
                                              "tf": 'log', "idf": 10.0}),
                         procs=8)
wb.dictionary_freeze = True
X_title = wb.transform(df_full['stemmed_title'])
# X_title = X_title[:, np.array(np.clip(X_title.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
print("Xtitle shape", X_title.shape)
X_text = wb.transform(df_full['stemmed_text'])
# X_text = X_text[:, np.array(np.clip(X_text.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
print("X_text shape", X_text.shape)
class WBFmFtrlModel(object):
    wb = wordbatch.WordBatch(None,
                             extractor=(WordHash, {"ngram_range": (1, 1), "analyzer": "word",
                                                   "lowercase": False, "n_features": D,
                                                   "norm": None, "binary": True}),
                             minibatch_size=batchsize // 80, procs=8, freeze=True,
                             timeout=1800, verbose=0)
    #clf = FM_FTRL(alpha=0.05, beta=0.1, L1=0.0, L2=0.0, D=D, alpha_fm=0.02, L2_fm=0.0,
    #              init_fm=0.01, weight_fm=1.0, D_fm=8, e_noise=0.0, iters=3,
    #              inv_link="sigmoid", e_clip=1.0, threads=4, use_avx=1, verbose=0)

    def __init__(self, train_file, test_file):
        self.train_file = train_file
        self.test_file = test_file
        self.clf = None

    def create_clf(self):
        if self.clf is not None:
            del (self.clf)
            gc.collect()
        self.clf = FM_FTRL(alpha=0.05, beta=0.1, L1=0.0, L2=0.0, D=D, alpha_fm=0.02,
                           L2_fm=0.0, init_fm=0.01, weight_fm=1.0, D_fm=16, e_noise=0.0,
                           iters=5, inv_link="sigmoid", e_clip=1.0, threads=4, use_avx=1,
                           verbose=0)

    def predict(self, predict_file):
        p = None
        test_preds = []
        click_ids = []
        X = None
        for df_c in pd.read_csv(predict_file, engine='c', chunksize=batchsize, sep=",",
                                usecols=predictors + ["click_id", "weight"]):
            str_array = df2csr(df_c[predictors].values)
            labels = df_c["click_id"].values
            weights = df_c["weight"].values
            click_ids += df_c['click_id'].tolist()
            del (df_c)
            if p is not None:
                test_preds += list(p.join())
            if X is not None:
                del (X)
                gc.collect()
            X = self.wb.transform(str_array)
            del (str_array)
            p = ThreadWithReturnValue(target=predict_batch, args=(self.clf, X))
            p.start()
        if p is not None:
            test_preds += list(p.join())
        if X is not None:
            del (X)
            gc.collect()
        return click_ids, test_preds

    def read_data_file(self, train_file, skip_rows, nrows):
        if skip_rows > 0:
            skip_rows = range(1, skip_rows)
        else:
            skip_rows = None
        df_c = pd.read_csv(train_file, skiprows=skip_rows, nrows=nrows, engine="c",
                           dtype=dtypes, usecols=predictors + ["weight", "click_id"])
        str_array = df2csr(df_c[predictors].values)
        X = self.wb.transform(str_array)
        labels = df_c["click_id"].values
        weights = df_c["weight"].values
        del (str_array)
        del (df_c)
        gc.collect()
        return X, labels, weights

    def predict_data(self, X, labels, weights):
        return predict_batch(self.clf, X)

    def train_all(self):
        p = None
        X = None
        rcount = 0
        if True:
            start_time = time.time()
            self.create_clf()
            print("Train using file:{}".format(self.train_file))
            print("Pretrain the model")
            start = 24903889
            start_loops = int(start / batchsize)
            pos = 0
            for i in range(start_loops + 1):
                if p is not None:
                    p.join()
                if X is not None:
                    del (X)
                    X = None
                    del (labels)
                    del (weights)
                    gc.collect()
                nrows = batchsize
                if pos + batchsize > start:
                    nrows = start - pos + 1
                    if nrows <= 1:
                        break
                print("Pretrain: pos={}, nrows={}".format(pos, nrows))
                if pos <= 0:
                    X, labels, weights = self.read_data_file(self.train_file, 0, nrows)
                    pos += nrows
                else:
                    skip = pos - batchsize
                    X, labels, weights = self.read_data_file(self.train_file, skip, nrows)
                    pos += nrows
                p = threading.Thread(target=fit_batch, args=(self.clf, X, labels, weights))
                p.start()
            rcount += start
            print("Training", rcount, time.time() - start_time)
            # First train
            tv = [batchsize, batchsize * 2, batchsize * 3, batchsize * 4]
            for idx, pos in enumerate(tv):
                skip = start + pos - batchsize
                if p is not None:
                    p.join()
                if X is not None:
                    del (X)
                    X = None
                    del (labels)
                    del (weights)
                    gc.collect()
                X, labels, weights = self.read_data_file(self.train_file, skip, batchsize)
                rcount += batchsize
                if idx >= 1:
                    if p is not None:
                        p.join()
                    p = threading.Thread(target=evaluate_batch,
                                         args=(self.clf, X, labels, rcount))
                    p.start()
                    if p is not None:
                        p.join()
                print("Training", rcount, time.time() - start_time)
                p = threading.Thread(target=fit_batch, args=(self.clf, X, labels, weights))
                p.start()
            if p is not None:
                p.join()
            if X is not None:
                del (X)
                X = None
                del (labels)
                del (weights)
                gc.collect()

    def train_cv(self):
        p = None
        X = None
        rcount = 0
        if True:
            start_time = time.time()
            train_valids = [
                [batchsize, batchsize * 2, batchsize * 3, batchsize * 4],
                [batchsize * 2, batchsize * 3, batchsize * 4, batchsize],
                [batchsize, batchsize * 3, batchsize * 4, batchsize * 2],
                [batchsize, batchsize * 2, batchsize * 4, batchsize * 3],
            ]
            all_cv_preds = np.zeros(shape=(4 * batchsize,), dtype=np.float16)
            for tv in train_valids:
                print("Train_CV: tv={}".format(tv))
                self.create_clf()
                print("Train using file:{}".format(self.train_file))
                print("Pretrain the model")
                start = 24903889
                start_loops = int(start / batchsize)
                pos = 0
                for i in range(start_loops + 1):
                    if p is not None:
                        p.join()
                    if X is not None:
                        del (X)
                        X = None
                        del (labels)
                        del (weights)
                        gc.collect()
                    nrows = batchsize
                    if pos + batchsize > start:
                        nrows = start - pos + 1
                        if nrows <= 1:
                            break
                    print("Pretrain: pos={}, nrows={}".format(pos, nrows))
                    if pos <= 0:
                        X, labels, weights = self.read_data_file(self.train_file, 0, nrows)
                        pos += nrows
                    else:
                        skip = pos - batchsize
                        X, labels, weights = self.read_data_file(self.train_file, skip, nrows)
                        pos += nrows
                    p = threading.Thread(target=fit_batch,
                                         args=(self.clf, X, labels, weights))
                    p.start()
                rcount += start
                print("Training", rcount, time.time() - start_time)
                # First train
                for idx, pos in enumerate(tv[:3]):
                    skip = start + pos - batchsize
                    if p is not None:
                        p.join()
                    if X is not None:
                        del (X)
                        X = None
                        del (labels)
                        del (weights)
                        gc.collect()
                    X, labels, weights = self.read_data_file(self.train_file, skip, batchsize)
                    rcount += batchsize
                    if idx % 2 == 1:
                        if p is not None:
                            p.join()
                        p = threading.Thread(target=evaluate_batch,
                                             args=(self.clf, X, labels, rcount))
                        p.start()
                        if p is not None:
                            p.join()
                    print("Training", rcount, time.time() - start_time)
                    p = threading.Thread(target=fit_batch,
                                         args=(self.clf, X, labels, weights))
                    p.start()
                if p is not None:
                    p.join()
                if X is not None:
                    del (X)
                    X = None
                    del (labels)
                    del (weights)
                    gc.collect()
                print("Predict for the validation data")
                pos = tv[3]
                skip = start + pos - batchsize
                X, labels, weights = self.read_data_file(self.train_file, skip, batchsize)
                pred = predict_batch(self.clf, X)
                all_cv_preds[pos - batchsize:pos] = np.reshape(pred, (batchsize,))
                if X is not None:
                    del (X)
                    X = None
                    del (labels)
                    del (weights)
                    gc.collect()
            # Save cv result data
            fname = "%s/cv_pred_%s_%s.csv" % (config.OUTPUT_DIR, "fmftrl", Ver)
            print("Save cv predictions:{}".format(fname))
            df = pd.DataFrame({"predicted": all_cv_preds})
            df.to_csv(fname, index=False, columns=["predicted"])
class WBFmFtrlModel(object): wb = wordbatch.WordBatch(None, extractor=(WordHash, {"ngram_range": (1, 1), "analyzer": "word", "lowercase": False, "n_features": D, "norm": None, "binary": True}) , minibatch_size=batchsize // 80, procs=8, freeze=True, timeout=1800, verbose=0) #clf = FM_FTRL(alpha=0.05, beta=0.1, L1=0.0, L2=0.0, D=D, alpha_fm=0.02, L2_fm=0.0, init_fm=0.01, weight_fm=1.0, # D_fm=8, e_noise=0.0, iters=3, inv_link="sigmoid", e_clip=1.0, threads=4, use_avx=1, verbose=0) def __init__(self,pretrain_files,train_file, test_file): self.pretrain_files = pretrain_files self.train_file = train_file self.test_file = test_file self.clf = None self.pretrain_model_fn = "wb_fmftrl_v26_pretrain.model" def create_clf(self): if self.clf is not None: del(self.clf) gc.collect() self.clf = FM_FTRL(alpha=0.05, beta=0.1, L1=0.0, L2=0.0, D=D, alpha_fm=0.02, L2_fm=0.0, init_fm=0.01, weight_fm=1.0, D_fm=16, e_noise=0.0, iters=5, inv_link="sigmoid", e_clip=1.0, threads=4, use_avx=1, verbose=0) def get_data(self, loader, fold= -1, chunk_size=10000000, file_size=40000000): if fold > 0: size_per_fold = int(file_size/fold) else: size_per_fold = chunk_size for (idx, df) in loader.get_chunk_data(): data = df[predictors].values labels = df['click_id'].values weights = df['weight'].values if fold == -1: fold_num = -1 else: fold_num = int(idx / size_per_fold) del(df) gc.collect() str_array = df2csr(data) X = self.wb.transform(str_array) del(str_array) del(data) gc.collect() yield (idx, fold_num, X, labels, weights) def do_thread_execute(self,target,clf, X, labels=None, weights=None,do_free=True): #str_array = df2csr(data) #gc.collect() #X = self.wb.transform(str_array) if labels is not None: args = (clf, X, labels, weights) else: args = (clf, X) p = ThreadWithReturnValue(target=target,args =args) p.start() ret = p.join() if do_free: del(X) if labels is not None: del(labels) if weights is not None: del(weights) gc.collect() return ret def predict(self,predict_file): test_preds = [] click_ids = [] test_loader = DataPiper(predict_file,logger) for (idx, fold_num, X, labels, weights) in self.get_data(test_loader): click_ids+= labels.tolist() test_preds += list(self.do_thread_execute(predict_batch,self.clf,X)) return click_ids, test_preds def predict_data(self, X, labels, weights): return predict_batch(self.clf, X) def pretrain(self): p = None X = None rcount = 0 start_time = time.time() self.create_clf() if not os.path.exists(self.pretrain_model_fn): print("Pretrain the model") for pretrain_file in self.pretrain_files: print("Pretrain using file:{}".format(pretrain_file)) loader = DataPiper(pretrain_file,logger) for (idx, fold_num, X, labels, weights) in self.get_data(loader): self.do_thread_execute(fit_batch,self.clf,X,labels,weights) with open(self.pretrain_model_fn,"wb") as f: params = self.clf.__getstate__() #self.create_clf() pkl.dump(params,f) #self.clf.pickle_model(self.pretrain_model_fn) else: with open(self.pretrain_model_fn,"rb") as f: params = pkl.load(f) self.clf.__setstate__(params) #self.clf.unpickle_model(self.pretrain_model_fn) def train_all(self): p = None X = None rcount = 0 start_time = time.time() self.create_clf() print("Pretrain the model") self.pretrain() """ for pretrain_file in self.pretrain_files: print("Pretrain using file:{}".format(pretrain_file)) loader = DataPiper(pretrain_file,logger) for (idx, fold_num, X, labels, weights) in self.get_data(loader): self.do_thread_execute(fit_batch,self.clf,X,labels,weights) """ print("Train with file={}".format(self.train_file)) rcount = 0 loader = 
DataPiper(self.train_file,logger) loops = 0 for (idx, fold_num, X, labels, weights) in self.get_data(loader): if loops % 2 == 0: self.do_thread_execute(evaluate_batch,self.clf,X,labels,weights, do_free=False) loops += 1 rcount += len(labels) print("Training", rcount, time.time() - start_time) self.do_thread_execute(fit_batch,self.clf,X,labels,weights) def train_cv(self): start_time = time.time() nfold = 4 train_preds = [] auc_cv = [0.0 for _ in range(nfold)] for fold in range(nfold): self.create_clf() print("Pretrain models") self.pretrain() """ for pretrain_file in self.pretrain_files: print("Pretrain using file:{}".format(pretrain_file)) loader = DataPiper(pretrain_file,logger) for (idx, fold_num, X, labels, weights) in self.get_data(loader): self.do_thread_execute(fit_batch,self.clf,X,labels,weights) """ print("Train with file={}".format(self.train_file)) file_size = 40000000 all_cv_preds = np.zeros(shape=(file_size,),dtype=np.float32) loader = DataPiper(self.train_file,logger) valid_datas = [] loops = 0 rcount = 0 for (idx, fold_num, X, labels, weights) in self.get_data(loader,fold=nfold,file_size=file_size): print("fold_num={},fold={},nfold={}".format(fold_num,fold,nfold)) if fold_num == fold: valid_datas.append((idx,fold_num,X,labels,weights)) print("Add valid_datas:len={}".format(len(valid_datas))) continue loops += 1 rcount += len(labels) if loops % 2 == 0: self.do_thread_execute(evaluate_batch,self.clf,X,labels,weights,do_free=False) print("Training", rcount, time.time() - start_time) self.do_thread_execute(fit_batch,self.clf,X,labels,weights) print("Predict for the validation data") print("Valid_datas:len={}".format(len(valid_datas))) valid_start_idx = valid_datas[0][0] valid_labels = [] valid_weights = [] valid_ds = [] for d in valid_datas: valid_labels.append(d[3]) valid_weights.append(d[4]) valid_ds.append(d[2]) #print("Valid_ds:d.len={},valid_ds.len={}".format(len(d[2]),len(valid_ds))) num = len(valid_labels) if num > 1: valid_weights = np.concatenate(valid_weights,axis=0) valid_labels = np.concatenate(valid_labels, axis=0) from scipy.sparse import hstack #valid_ds = np.concatenate(valid_ds,axis=0) valid_ds = hstack(valid_ds,axis=0) else: valid_labels = valid_labels[0] valid_weights = valid_weights[0] valid_ds = valid_ds[0] y_pred = self.do_thread_execute(predict_batch,self.clf,valid_ds) num = len(valid_labels) y_pred = np.reshape(y_pred,(num,)) print("y_pred.shape={}".format(y_pred.shape)) print("valid_labels.shape={}".format(valid_labels.shape)) valid_labels = np.reshape(valid_labels,(num,)) train_preds.append((valid_start_idx,num,y_pred)) auc_cv[fold] = dist_utils._auc(valid_labels, y_pred) logger.info(" {:>3} {:>8} {} x {}".format( fold+1, np.round(auc_cv[fold],6), valid_ds.shape[0], valid_ds.shape[1])) #clean up del(valid_datas) del(valid_ds) del(valid_labels) del(valid_weights) gc.collect() # Save cv result data fname = "%s/cv_pred_%s_%s.csv"%(config.OUTPUT_DIR, "fmftrl",Ver) print("Save cv predictions:{}".format(fname)) df = pd.DataFrame({"predicted": all_cv_preds}) df.to_csv(fname, index=False, columns=["predicted"])
def getFMFTRL():
    #os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')
    glove_file = '../feat/glove.6B.50d.txt'
    threads = 8
    save_dir = '../feat'
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]
    ix = (merge['brand_name'] == merge['brand_name']) & \
         (~merge['brand_name'].str.lower().fillna('ZZZZZZ').isin(merge['name'].str.lower()))
    merge['name'][ix] = merge['brand_name'][ix] + ' ' + merge['name'][ix]
    # EXTRACT DEVELOPMENT TEST
    trnidx, validx = train_test_split(range(train.shape[0]), random_state=233,
                                      train_size=0.90)
    del train
    del test
    gc.collect()
    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    #merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))
    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))
    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))
    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))

    '''
    Regex characteristics - carat, gb/tb, cpu
    '''
    def count_rgx(regexls, idx_, filter_=None):
        colvals = merge['name'][idx_] + ' ' + merge['item_description'][idx_]
        vals = pd.Series(np.zeros(len(colvals)))
        for rgx_ in regexls:
            valsls = colvals.str.findall(rgx_, re.IGNORECASE)
            vals[vals == 0] += pd.Series([int(v[0]) if len(set(v)) == 1 else 0
                                          for v in valsls])[vals == 0]
        if filter_:
            vals[~vals.isin(filter_)] = 0.
        return vals

    def count_rgx_name(regexls, idx_, filter_=None):
        colvals = merge['name'][idx_]
        vals = pd.Series(np.zeros(len(colvals)))
        for rgx_ in regexls:
            valsls = colvals.str.findall(rgx_, re.IGNORECASE)
            vals[vals == 0] += pd.Series([int(v[0]) if len(v) != 0 else 0
                                          for v in valsls])[vals == 0]
        if filter_:
            vals[~vals.isin(filter_)] = 0.
        return vals

    # gold
    measures = np.zeros((merge.shape[0], 4))
    ix_chk = ((merge.name.str.contains('gold', case=False)) |
              (merge.item_description.str.contains('gold', case=False))) & \
             (merge['subcat_1'] == 'Jewelry')
    rgxls = [r"(\d+)k ", r"(\d+)kt ", r"(\d+)k.", r"(\d+)kt.", r"(\d+)k,", r"(\d+)kt,",
             r"(\d+) k ", r"(\d+) kt", r"(\d+) k.", r"(\d+) kt.", r"(\d+) k,", r"(\d+) kt,"]
    measures[ix_chk, 0] = count_rgx(rgxls, ix_chk,
                                    filter_=[10, 12, 14, 16, 18, 20, 21, 22, 23, 24])
    # phone memory
    ix_chk = (merge['subcat_2'] == 'Cell Phones & Smartphones')
    rgxls = [r"(\d+)gb ", r"(\d+) gb", r"(\d+)gb.", r"(\d+) gb.", r"(\d+)gb,", r"(\d+) gb,"]
    measures[ix_chk, 1] = count_rgx(rgxls, ix_chk)
    # console memory
    ix_chk = (merge['subcat_2'] == 'Consoles')
    rgxls = [r"(\d+)gb ", r"(\d+) gb", r"(\d+)gb.", r"(\d+) gb.", r"(\d+)gb,", r"(\d+) gb,"]
    measures[ix_chk, 2] = count_rgx(rgxls, ix_chk)
    # computer memory
    ix_chk = (merge['category_name'] == 'Electronics/Computers & Tablets/Laptops & Netbooks') | \
             (merge['category_name'] == 'Electronics/Computers & Tablets/Desktops & All-In-Ones')
    rgxls = [r"(\d+)gb ", r"(\d+) gb", r"(\d+)gb.", r"(\d+) gb.", r"(\d+)gb,", r"(\d+) gb,"]
    measures[ix_chk, 3] = count_rgx(rgxls, ix_chk)
    # cpu
    # oz
    # diamond
    #r"(\d+) karat ", r"(\d+) carat "

    '''
    Crossed columns
    '''
    # my understanding on how to replicate what layers.crossed_column does. One
    # can read here: https://www.tensorflow.org/tutorials/linear.
    def cross_columns(x_cols):
        """simple helper to build the crossed columns in a pandas dataframe"""
        crossed_columns = dict()
        colnames = ['_'.join(x_c) for x_c in x_cols]
        for cname, x_c in zip(colnames, x_cols):
            crossed_columns[cname] = x_c
        return crossed_columns

    merge['item_condition_id_str'] = merge['item_condition_id'].astype(str)
    merge['shipping_str'] = merge['shipping'].astype(str)
    x_cols = (
        ['brand_name', 'item_condition_id_str'],
        ['brand_name', 'subcat_1'],
        ['brand_name', 'subcat_2'],
        ['brand_name', 'general_cat'],
        #['brand_name', 'subcat_1', 'item_condition_id_str'],
        #['brand_name', 'subcat_2', 'item_condition_id_str'],
        #['brand_name', 'general_cat', 'item_condition_id_str'],
        ['brand_name', 'shipping_str'],
        ['shipping_str', 'item_condition_id_str'],
        ['shipping_str', 'subcat_2'],
        ['item_condition_id_str', 'subcat_2'])
    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(merge.select_dtypes(include=['object']).columns)
    D = 2 ** 30
    for k, v in crossed_columns_d.items():
        print('Crossed column ', k)
        outls_ = []
        indicator = 0
        for col in v:
            outls_.append((np.array(merge[col].apply(hash))) % D + indicator)
            indicator += 10 ** 6
        merge[k] = sum(outls_).tolist()

    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    lb = LabelBinarizer(sparse_output=True)
    x_col = lb.fit_transform(merge[cross_nm[0]])
    for i in range(1, len(cross_nm)):
        x_col = hstack((x_col, lb.fit_transform(merge[cross_nm[i]])))
    del (lb)

    '''
    Encode Original Strings
    '''
    '''
    for col in ['item_description', 'name']:
        lb = LabelBinarizer(sparse_output=True)
        if 'X_orig' not in locals():
            X_orig = lb.fit_transform(merge[col].apply(hash))
        else:
            X_orig = hstack((X_orig, lb.fit_transform(merge[col].apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['item_description']+merge['name']).apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['brand_name']+merge['name']).apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['subcat_2']+merge['name']).apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['brand_name']+merge['name']+merge['item_description']).apply(hash))))
    X_orig = X_orig.tocsr()
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 2, 0, 1), dtype=bool)]
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 5000, 1, 0), dtype=bool)]
    print ('Shape of original hash', X_orig.shape)
    X_orig = X_orig.tocoo()
    '''
    gc.collect()
    cpuStats()

    '''
    Hash name
    '''
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {"hash_ngrams": 2,
                                                  "hash_ngrams_weights": [1.5, 1.0],
                                                  "hash_size": 2 ** 29, "norm": None,
                                                  "tf": 'binary', "idf": None}),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del (wb)
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    '''
    Hash category
    '''
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {"hash_ngrams": 2,
                                                  "hash_ngrams_weights": [1.0, 1.0],
                                                  "hash_size": 2 ** 20, "norm": None,
                                                  "tf": 'binary', "idf": None}),
                             procs=8)
    wb.dictionary_freeze = True
    cat = merge["category_name"].str.replace('/', ' ')
    X_cat = wb.fit_transform(cat)
    del (wb)
    X_cat = X_cat[:, np.array(np.clip(X_cat.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `category` completed.'.format(time.time() - start_time))

    '''
    Count category
    '''
    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

    # wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3,
    #                          "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {"hash_ngrams": 2,
                                                  "hash_ngrams_weights": [1.0, 1.0],
                                                  "hash_size": 2 ** 28, "norm": "l2",
                                                  "tf": 1.0, "idf": None}),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del (wb)
    X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1),
                                              dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    X_memory = lb.fit_transform(merge['measure_memory'])
    mask = np.array(np.clip(X_memory.getnnz(axis=0) - 10 ** 6, 1, 0), dtype=bool)
    X_memory = X_memory[:, mask]
    X_gold = lb.fit_transform(merge['measure_gold'])
    mask = np.array(np.clip(X_gold.getnnz(axis=0) - 10 ** 6, 1, 0), dtype=bool)
    X_gold = X_gold[:, mask]
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

    X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']],
                                          sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(
        time.time() - start_time))
    '''
    print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape,
          X_category2.shape, X_category3.shape, X_name.shape, X_cat.shape, x_col.shape,
          X_orig.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2,
                           X_category3, X_name, X_cat, x_col, X_orig)).tocsr()
    '''
    print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape,
          X_category2.shape, X_category3.shape, X_name.shape, X_cat.shape, x_col.shape,
          X_memory.shape, X_gold.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2,
                           X_category3, X_name, X_cat, x_col, X_memory, X_gold)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))

    # Remove features with document frequency <= 1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)
    gc.collect()
    if develop:
        #train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233)
        train_X, valid_X, train_y, valid_y = X[trnidx], X[validx], y.values[trnidx], y.values[validx]
    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=sparse_merge.shape[1],
                    alpha_fm=0.01, L2_fm=0.0, init_fm=0.01, D_fm=200, e_noise=0.0001,
                    iters=1, inv_link="identity", threads=threads)  #iters=15
    baseline = 1.
    for i in range(15):
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline:
            baseline = score_
        else:
            break
    print('[{}] Train ridge v2 completed'.format(time.time() - start_time))
    if develop:
        predsfm = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(predsfm)))
        # 0.44532
    # Full data 0.424681
    # 0.419741
    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
    return merge, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
text_seq_cols = ["name", "item_description"] fill_missing(all_data, text_cols, num_cols, bin_cols) all_data["all_text"] = all_data["brand_name"].astype(str) + " " + all_data[ "name"].astype(str) + " " + all_data['item_description'] all_data["name_brand"] = all_data["brand_name"].astype( str) + " " + all_data["name"].astype(str) wb = wordbatch.WordBatch( normalize_text, extractor=( WordBag, { "hash_ngrams": 1, # "hash_ngrams_weights": [1.5, 1.0], "hash_size": 2**29, "norm": None, "tf": 'binary', "idf": None, }), procs=8) wb.dictionary_freeze = True X_all_text = wb.fit_transform(all_data['all_text']) del (wb) X_all_text = X_all_text[:, np.array(np.clip(X_all_text.getnnz(axis=0) - 1, 0, 1), dtype=bool)] print('[{}] Vectorize `all text` completed.'.format(time.time() - start_time)) print(X_all_text.shape)
dataset.loc[~dataset['subcat_1'].isin(pop_category2), 'subcat_1'] = 'missing'
dataset.loc[~dataset['subcat_2'].isin(pop_category3), 'subcat_2'] = 'missing'
cutting(full_df)

stopwords = {x: 1 for x in stopwords.words('english')}
non_alphanums = re.compile(u'[^A-Za-z0-9]+')

def normalize_text(text):
    return u" ".join(
        [x for x in [y for y in non_alphanums.sub(' ', text).lower().strip().split(" ")]
         if x not in stopwords])

wb = wordbatch.WordBatch(normalize_text,
                         extractor=(WordBag, {"hash_ngrams": 3,
                                              "hash_ngrams_weights": [1.6, 0.8, 0.4],
                                              "hash_size": 2 ** 28, "norm": "l2",
                                              "tf": 'binary', "idf": None}),
                         procs=8)
wb.dictionary_freeze = True
X_name = wb.fit_transform(full_df['name'])
del (wb)
#X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
X_name = X_name[:, np.where(X_name.getnnz(axis=0) > 2)[0]]
#print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

wb = CountVectorizer()
X_category1 = wb.fit_transform(full_df['subcat_0'])
X_category2 = wb.fit_transform(full_df['subcat_1'])
X_category3 = wb.fit_transform(full_df['subcat_2'])
#print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))
import re

def normalize_text(text):
    return u" ".join(
        [x for x in [y for y in non_alphanums.sub(' ', text).lower().strip().split(" ")]
         if len(x) > 1 and x not in stopwords])

merge = pd.read_pickle('merge.pkl')
wb = wordbatch.WordBatch(extractor=(WordBag, {"hash_ngrams": 2,
                                              "hash_ngrams_weights": [1.5, 1.0],
                                              "hash_size": 2 ** 29, "norm": None,
                                              "tf": 'binary', "idf": None}),
                         procs=0)
merge['name'] = merge['name'].map(lambda x: normalize_text(x))
wb.dictionary_freeze = True
X_name = wb.fit_transform(merge['name'])
del (wb)
X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
def wordbatch_algo(test):
    import time
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    # if 1 == 1:
    #     train = pd.read_table('../input/mercari-price-suggestion-challenge/train.tsv', engine='c')
    #     test = pd.read_table('../input/mercari-price-suggestion-challenge/test.tsv', engine='c')
    train = pd.read_table('../input/train.tsv', engine='c')
    # test = pd.read_table('../input/test.tsv', engine='c')
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge: pd.DataFrame = pd.concat([train, dftt, test])
    # submission: pd.DataFrame = test[['test_id']]
    '''
    # Mean of each group
    # https://stackoverflow.com/questions/30244952/python-pandas-create-new-column-with-groupby-sum
    cat_mean = train['price'].groupby(train['category_name']).mean()
    cat_mean = pd.DataFrame({'category_name': cat_mean.index, 'cat_mean': cat_mean.values})
    merge = merge.merge(cat_mean, on=['category_name'], how='left')
    # print(merge.head())
    X_cat_mean = merge['cat_mean'].as_matrix().reshape(-1, 1)
    # X_cat_mean = normalize(np.nan_to_num(X_cat_mean).reshape(-1, 1), norm='max')
    cond_mean = train['price'].groupby(train['item_condition_id']).mean()
    cond_mean = pd.DataFrame({'item_condition_id': cond_mean.index, 'cond_mean': cond_mean.values})
    merge = merge.merge(cond_mean, on=['item_condition_id'], how='left')
    X_cond_mean = merge['cond_mean'].as_matrix().reshape(-1, 1)
    brand_mean = train['price'].groupby(train['brand_name']).mean()
    brand_mean = pd.DataFrame({'brand_name': brand_mean.index, 'brand_mean': brand_mean.values})
    merge = merge.merge(brand_mean, on=['brand_name'], how='left')
    X_brand_mean = merge['brand_mean'].as_matrix().reshape(-1, 1)
    ship_mean = train['price'].groupby(train['shipping']).mean()
    ship_mean = pd.DataFrame({'shipping': ship_mean.index, 'ship_mean': ship_mean.values})
    merge = merge.merge(ship_mean, on=['shipping'], how='left')
    X_ship_mean = merge['ship_mean'].as_matrix().reshape(-1, 1)
    '''
    del train
    del test
    gc.collect()
    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))
    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))
    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))
    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))
    # Add some new features:
    X_len_desc = merge['item_description'].apply(lambda x: len(x)).as_matrix().reshape(-1, 1)
    X_len_name = merge['name'].apply(lambda x: len(x)).as_matrix().reshape(-1, 1)
    # X_len_description = normalize(np.nan_to_num(X_len_description).reshape(-1, 1), norm='max')
    # X_len_name = normalize(np.nan_to_num(X_len_name).reshape(-1, 1), norm='max')
    print('[{}] Length `item_description` completed.'.format(time.time() - start_time))
    print('[{}] Length `name` completed.'.format(time.time() - start_time))
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {"hash_ngrams": 2,
                                                  "hash_ngrams_weights": [1.5, 1.0],
                                                  "hash_size": 2 ** 29, "norm": None,
                                                  "tf": 'binary', "idf": None}),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del (wb)
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))
    # wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3,
    #                          "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {"hash_ngrams": 2,
                                                  "hash_ngrams_weights": [1.0, 1.0],
                                                  "hash_size": 2 ** 28, "norm": "l2",
                                                  "tf": 1.0, "idf": None}),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del (wb)
    X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1),
                                              dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))
    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))
    X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']],
                                          sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(
        time.time() - start_time))
    print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape,
          X_category2.shape, X_category3.shape, X_name.shape)
    #, X_glove.shape, X_len_description.shape, X_len_name.shape, X_cat_mean.shape)
    # sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2,
    #                        X_category3, X_name)).tocsr()
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2,
                           X_category3, X_name)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))
    del X_dummies, merge, X_description, lb, X_brand, X_category1, X_category2, X_category3, X_name
    gc.collect()
    # pd.to_pickle((sparse_merge, y), "xy.pkl")
    # else:
    #     nrow_train, nrow_test = 1481661, 1482535
    #     sparse_merge, y = pd.read_pickle("xy.pkl")
    # Remove features with document frequency <= 1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X, y, test_size=TEST_SIZE,
                                                              random_state=SPLIT_SEED)
    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=sparse_merge.shape[1],
                    alpha_fm=0.01, L2_fm=0.0, init_fm=0.01, D_fm=200, e_noise=0.0001,
                    iters=FM_iter, inv_link="identity", threads=4)
    model.fit(train_X, train_y)
    gc.collect()
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))
    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
    gc.collect()
    print(predsFM)
    #model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=sparse_merge.shape[1],
    #             iters=50, inv_link="identity", threads=1)
    model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=sparse_merge.shape[1],
                 iters=FTRL_iter, inv_link="identity", threads=1)
    del X
    gc.collect()
    model.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))
    predsF = model.predict(X_test)
    print('[{}] Predict FTRL completed'.format(time.time() - start_time))
    print(predsF)
    del train_X, train_y
    del X_test
    return predsFM, predsF
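# split_cat, used by the Mercari-style functions above, is another helper the
# snippets assume but never define. A sketch of the conventional version
# (assumed, not taken from the source): split an "A/B/C" category path into
# its three levels, with a fallback for missing values.
def split_cat(text):
    try:
        return text.split("/")
    except AttributeError:  # NaN / non-string category_name
        return ("No Label", "No Label", "No Label")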
def getFMFTRL(moddict):
    #os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')
    glove_file = '../feat/glove.6B.50d.txt'
    threads = 8
    save_dir = '../feat'
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]
    # EXTRACT DEVELOPMENT TEST
    trnidx, validx = train_test_split(range(train.shape[0]), random_state=233,
                                      train_size=0.90)
    del train
    del test
    gc.collect()
    cpuStats()
    merge = prepFMFeatures(merge)
    cpuStats()
    merge.head()

    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    moddict['cross_cols'] = {}
    for i in range(0, len(cross_nm)):
        moddict['cross_cols'][cross_nm[i]] = LabelBinarizer(sparse_output=True)
        moddict['cross_cols'][cross_nm[i]].fit(merge[cross_nm[i]])
        if i == 0:
            x_col = moddict['cross_cols'][cross_nm[i]].transform(merge[cross_nm[i]])
        else:
            x_col = hstack((x_col,
                            moddict['cross_cols'][cross_nm[i]].transform(merge[cross_nm[i]])))
        del merge[cross_nm[i]]
    gc.collect()
    cpuStats()

    '''
    Hash name
    '''
    moddict['wb_name'] = wordbatch.WordBatch(normalize_text,
                                             extractor=(WordBag, {"hash_ngrams": 2,
                                                                  "hash_ngrams_weights": [1.5, 1.0],
                                                                  "hash_size": 2 ** 29,
                                                                  "norm": None, "tf": 'binary',
                                                                  "idf": None, 'verbose': 1}),
                                             procs=8)
    moddict['wb_name'].dictionary_freeze = True
    X_name = moddict['wb_name'].fit_transform(merge['name'])
    moddict['wb_name_mask'] = np.array(np.clip(X_name[:nrow_train].getnnz(axis=0) - 1, 0, 1),
                                       dtype=bool)
    X_name = X_name[:, moddict['wb_name_mask']]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    '''
    Hash category #2
    '''
    moddict['wb_cat'] = wordbatch.WordBatch(normalize_text,
                                            extractor=(WordBag, {"hash_ngrams": 2,
                                                                 "hash_ngrams_weights": [1.0, 1.0],
                                                                 "hash_size": 2 ** 20,
                                                                 "norm": None, "tf": 'binary',
                                                                 "idf": None}),
                                            procs=4)
    moddict['wb_cat'].dictionary_freeze = True
    ### This must be the full dataset
    cats = merge["category_name"].str.replace('/', ' ').unique()
    moddict['wb_cat'].fit(cats)
    X_cat_tmp = moddict['wb_cat'].transform(cats)
    moddict['wb_cat_dict'] = dict([(c, X_cat_tmp.getrow(row))
                                   for (c, row) in zip(cats.tolist(), range(len(cats)))])
    X_cat = vstack(([moddict['wb_cat_dict'][c]
                     for c in merge["category_name"].str.replace('/', ' ')]))
    moddict['wb_cat_mask'] = np.array(np.clip(X_cat[:nrow_train].getnnz(axis=0) - 1, 0, 1),
                                      dtype=bool)
    X_cat = X_cat[:, moddict['wb_cat_mask']]
    print('[{}] Vectorize `category` completed.'.format(time.time() - start_time))

    '''
    Count category
    '''
    moddict['wb_cat_ctgc'] = CountVectorizer()
    moddict['wb_cat_ctgc'].fit(merge['general_cat'])
    X_category1 = moddict['wb_cat_ctgc'].transform(merge['general_cat'])
    moddict['wb_cat_ctsc1'] = CountVectorizer()
    moddict['wb_cat_ctsc1'].fit(merge['subcat_1'])
    X_category2 = moddict['wb_cat_ctsc1'].transform(merge['subcat_1'])
    moddict['wb_cat_ctsc2'] = CountVectorizer()
    moddict['wb_cat_ctsc2'].fit(merge['subcat_2'])
    X_category3 = moddict['wb_cat_ctsc2'].transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

    moddict['wb_dscr'] = wordbatch.WordBatch(normalize_text,
                                             extractor=(WordBag, {"hash_ngrams": 2,
                                                                  "hash_ngrams_weights": [1.0, 0.6],
                                                                  "hash_size": 2 ** 28,
                                                                  "norm": None, "tf": 'binary',
                                                                  "idf": None}),
                                             procs=8)
    moddict['wb_dscr'].dictionary_freeze = True
    X_description = moddict['wb_dscr'].fit_transform(merge['name'] + ' ' +
                                                     merge['item_description'])
    moddict['wb_dscr_mask'] = np.array(np.clip(X_description[:nrow_train].getnnz(axis=0) - 1, 0, 1),
                                       dtype=bool)
    X_description = X_description[:, moddict['wb_dscr_mask']]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

    moddict['wb_brandname'] = LabelBinarizer(sparse_output=True)
    moddict['wb_brandname'].fit(merge['brand_name'][:nrow_train])
    X_brand = moddict['wb_brandname'].transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

    moddict['wb_itemcond'] = LabelBinarizer(sparse_output=True)
    moddict['wb_itemcond'].fit(merge['item_condition_id'][:nrow_train])
    X_itemcond = moddict['wb_itemcond'].transform(merge['item_condition_id'])
    print('[{}] Label binarize `item_condition_id` completed.'.format(time.time() - start_time))

    moddict['wb_shipping'] = LabelBinarizer(sparse_output=True)
    moddict['wb_shipping'].fit(merge['shipping'][:nrow_train])
    X_shipping = moddict['wb_shipping'].transform(merge['shipping'])
    print('[{}] Label binarize `shipping` completed.'.format(time.time() - start_time))

    print(X_itemcond.shape, X_shipping.shape,  #X_dummies.shape,
          X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape,
          X_category3.shape, X_name.shape, X_cat.shape, x_col.shape)
    sparse_merge = hstack((X_itemcond, X_shipping,  #X_dummies,
                           X_description, X_brand, X_category1, X_category2, X_category3,
                           X_name, X_cat, x_col)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))

    # Remove features with document frequency <= 1
    print(sparse_merge.shape)
    moddict['all_mask'] = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, moddict['all_mask']]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)
    gc.collect()
    if develop:
        #train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233)
        train_X, valid_X, train_y, valid_y = X[trnidx], X[validx], y.values[trnidx], y.values[validx]
    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1, D=sparse_merge.shape[1],
                    alpha_fm=0.01, L2_fm=0.0, init_fm=0.01, D_fm=200, e_noise=0.0001,
                    iters=1, inv_link="identity", threads=threads)  #iters=15
    baseline = 1.
    for i in range(15):
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline - 0.0004:
            baseline = score_
        else:
            break
    # 0.41357
    moddict['FMmodel'] = model
    print('[{}] Train ridge v2 completed'.format(time.time() - start_time))
    if develop:
        predsfm = moddict['FMmodel'].predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(predsfm)))
        # 0.44532
    # Full data 0.424681
    predsFM = moddict['FMmodel'].predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
    return merge, moddict, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
#from sklearn.feature_extraction.text import HashingVectorizer
#from sklearn.linear_model import *
#vct= HashingVectorizer()
#clf= SGDRegressor()
import wordbatch
from wordbatch.models import FTRL
from wordbatch.extractors import WordBag

wb = wordbatch.WordBatch(extractor=(WordBag, {"hash_ngrams": 2,
                                              "hash_ngrams_weights": [0.5, -1.0],
                                              "hash_size": 2**23,
                                              "norm": 'l2',
                                              "tf": 'log',
                                              "idf": 50.0}))
# D=2**25 is larger than the 2**23 hash space, which is harmless
clf = FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2**25, iters=1)

train_texts = ["Cut down a tree with a herring? It can't be done.",
               "Don't say that word.",
               "How can we not say the word if you don't tell us what it is?"]
train_labels = [1, 0, 1]
test_texts = ["Wait! I said it! I said it! Ooh! I said it again!"]

values = wb.transform(train_texts)
clf.fit(values, train_labels)
preds = clf.predict(wb.transform(test_texts))
print("values={}".format(values))
print("values shape={}".format(values.shape))  # len() is ambiguous for sparse matrices
print("texts={}".format(test_texts))
print("transformed={}".format(wb.transform(test_texts)))
print(preds)
    def join(self):
        threading.Thread.join(self)
        return self._return

batchsize = 10000000
D = 2**20
wb = wordbatch.WordBatch(None,
                         extractor=(WordHash, {
                             "ngram_range": (1, 1),
                             "analyzer": "word",
                             "lowercase": False,
                             "n_features": D,
                             "norm": None,
                             "binary": True
                         }),
                         minibatch_size=batchsize // 80,
                         procs=8,
                         freeze=True,
                         timeout=1800,
                         verbose=0)
clf = FM_FTRL(alpha=0.05, beta=0.1, L1=0.0, L2=0.0, D=D,
              alpha_fm=0.02, L2_fm=0.0, init_fm=0.01, weight_fm=1.0,
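# The join() above is the tail of a thread subclass that hands back its
# target's return value (the ThreadWithReturnValue used by FMFtrlModel below).
# A minimal sketch of the full class, reconstructed as an assumption using the
# Python 3 names of threading.Thread's private attributes:
import threading

class ThreadWithReturnValue(threading.Thread):
    def __init__(self, group=None, target=None, name=None, args=(), kwargs=None):
        threading.Thread.__init__(self, group, target, name, args, kwargs or {})
        self._return = None

    def run(self):
        # Run the target and stash its result for join() to return.
        if self._target is not None:
            self._return = self._target(*self._args, **self._kwargs)

    def join(self, timeout=None):
        threading.Thread.join(self, timeout)
        return self._return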
K_model.fit(X_train_K, Y_train, epochs=4, batch_size=batch_size, verbose=10)
print("Keras model training completed!")
# drop() returns a copy; the original discarded it, so the columns survived
X_train = X_train.drop(["main_catL", "subcat1L", "subcat2L", "brand_nameL",
                        "seq_product_desc", "price_leak"], axis=1)
del X_train_K, raw_text
gc.collect()
cutting(X_train)
to_categorical(X_train)

wb1 = wordbatch.WordBatch(normalize_text,
                          extractor=(WordBag, {"hash_ngrams": 2,
                                               "hash_ngrams_weights": [1.5, 1.0],
                                               "hash_size": 2**29,
                                               "norm": None,
                                               "tf": 'binary',
                                               "idf": None}),
                          procs=8)
wb1.dictionary_freeze = True
wb1.fit(X_train["name"])
X_train_name = wb1.transform(X_train["name"])
mask1 = np.where(X_train_name.getnnz(axis=0) > 1)[0]
X_train_name = X_train_name[:, mask1]

wb2 = wordbatch.WordBatch(normalize_text,
                          extractor=(WordBag, {"hash_ngrams": 2,
                                               "hash_ngrams_weights": [1.0, 1.0],
                                               "hash_size": 2**28,
                                               "norm": "l2",
                                               "tf": 1.0,
                                               "idf": None}),
                          procs=8)
wb2.dictionary_freeze = True
wb2.fit(X_train["item_description"])
X_train_description = wb2.transform(X_train["item_description"])
mask2 = np.where(X_train_description.getnnz(axis=0) > 1)[0]
                how='left', on='FileID')
# Sequence of ProductID
tmp = log_data.groupby('FileID')['ProductID_le'].apply(list)
data = pd.merge(data, tmp.to_frame().reset_index(), how='left', on='FileID')
log_data.drop(['CustomerID_le', 'ProductID_le'], axis=1, inplace=True)
print('Sequential label encoding completed.')

wb = wordbatch.WordBatch(list2str,
                         extractor=(WordBag, {
                             "hash_ngrams": 2,
                             "hash_ngrams_weights": [1.5, 1.0],
                             "hash_size": 2**29,
                             "norm": "l2",
                             "tf": 'binary',
                             "idf": None
                         }),
                         procs=8)
wb.dictionary_freeze = True
X_cust = wb.fit_transform(data['CustomerID_le'])
X_cust = X_cust[:, np.where(X_cust.getnnz(axis=0) > 1)[0]]
print('Shape of X_cust: {0}'.format(X_cust.shape))
del wb
save_sparse_csr(data_path + '/cust_{0}.npz'.format('v1'), X_cust)
print('Vectorize `CustomerID` completed.')

wb = wordbatch.WordBatch(list2str,
                         extractor=(WordBag, {
                             "hash_ngrams": 2,
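# list2str is passed as the WordBatch normalizer but never defined in the
# snippet. A plausible definition, labeled as an assumption, that turns the
# per-file ID lists produced by the groupby above into hashable text:
def list2str(x):
    # Join a list of label-encoded IDs into a space-separated token string;
    # non-lists (e.g. NaN from the left merge) become an empty document.
    return ' '.join(str(v) for v in x) if isinstance(x, list) else ''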
def trainFMFTRL(moddict):
    merge = pd.read_csv(trn_file, sep='\t', encoding='utf-8')
    mergetst = pd.read_csv(tst_file, sep='\t', encoding='utf-8')
    #test = pd.read_csv(tst_file, sep='\t', encoding='utf-8')
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', merge.shape)
    dftt = merge[(merge.price < 1.0)]
    merge = merge.drop(merge[(merge.price < 1.0)].index)
    del dftt['price']
    nrow_train = merge.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(merge["price"])
    merge = pd.concat([merge, dftt])
    merge['target'] = np.log1p(merge["price"])

    # EXTRACT DEVELOPMENT TEST
    trnidx, validx = train_test_split(range(merge[:nrow_train].shape[0]),
                                      random_state=233, train_size=0.90)
    gc.collect()
    cpuStats()
    merge = prepFMFeatures(merge)
    mergetst = prepFMFeatures(mergetst)
    cpuStats()

    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    moddict['cross_cols'] = {}
    for i in range(0, len(cross_nm)):
        moddict['cross_cols'][cross_nm[i]] = LabelBinarizer(sparse_output=True)
        moddict['cross_cols'][cross_nm[i]].fit(merge[cross_nm[i]])
        if i == 0:
            x_col = moddict['cross_cols'][cross_nm[i]].transform(merge[cross_nm[i]])
        else:
            # transform(), not fit_transform(): the binarizer was already fit above
            x_col = hstack((x_col,
                            moddict['cross_cols'][cross_nm[i]].transform(merge[cross_nm[i]])))
        del merge[cross_nm[i]]
    gc.collect()
    cpuStats()

    '''
    Test count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    for i in range(0, len(cross_nm)):
        if i == 0:
            x_coltst = moddict['cross_cols'][cross_nm[i]].transform(mergetst[cross_nm[i]])
        else:
            # the original stacked onto x_col and refit the binarizer on the
            # test data; both were bugs
            x_coltst = hstack((x_coltst,
                               moddict['cross_cols'][cross_nm[i]].transform(mergetst[cross_nm[i]])))
        del mergetst[cross_nm[i]]
    gc.collect()
    cpuStats()

    '''
    Hash name
    '''
    moddict['wb_name'] = wordbatch.WordBatch(normalize_text,
                                             extractor=(WordBag, {
                                                 "hash_ngrams": 2,
                                                 "hash_ngrams_weights": [1.5, 1.0],
                                                 "hash_size": 2**29,
                                                 "norm": None,
                                                 "tf": 'binary',
                                                 "idf": None,
                                                 'verbose': 1,
                                             }),
                                             procs=8)
    moddict['wb_name'].dictionary_freeze = True
    X_name = moddict['wb_name'].fit_transform(merge['name'])
    moddict['wb_name_mask'] = np.where(X_name[:nrow_train].getnnz(axis=0) > 0)[0]
    X_name = X_name[:, moddict['wb_name_mask']]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    '''
    Test hash name
    '''
    # stored separately; the original overwrote the train X_name, which would
    # break the row counts in the hstack below
    X_nametst = moddict['wb_name'].transform(mergetst['name'])
    X_nametst = X_nametst[:, moddict['wb_name_mask']]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    '''
    Hash category #2
    '''
    moddict['wb_cat'] = wordbatch.WordBatch(normalize_text,
                                            extractor=(WordBag, {
                                                "hash_ngrams": 2,
                                                "hash_ngrams_weights": [1.0, 1.0],
                                                "hash_size": 2**20,
                                                "norm": None,
                                                "tf": 'binary',
                                                "idf": None,
                                            }),
                                            procs=4)
    moddict['wb_cat'].dictionary_freeze = True
    ### This must be the full dataset
    #cats = merge["category_name"].str.replace('/', ' ').unique()
    moddict['wb_cat'].fit(categories)
    X_cat_tmp = moddict['wb_cat'].transform(categories)
    moddict['wb_cat_dict'] = dict([(c, X_cat_tmp.getrow(row))
                                   for (c, row) in zip(categories.tolist(), range(len(categories)))])
    X_cat = vstack(([moddict['wb_cat_dict'][c]
                     for c in merge["category_name"].str.replace('/', ' ')]))
    #moddict['wb_cat_mask'] = np.array(np.clip(X_cat[:nrow_train].getnnz(axis=0) - 1, 0, 1), dtype=bool)
    moddict['wb_cat_mask'] = np.where(X_cat[:nrow_train].getnnz(axis=0) > 0)[0]
    X_cat = X_cat[:, moddict['wb_cat_mask']]
    print('[{}] Vectorize `category` completed.'.format(time.time() - start_time))

    '''
    Count category
    '''
    moddict['wb_cat_ctgc'] = CountVectorizer()
    moddict['wb_cat_ctgc'].fit(merge['general_cat'])
    X_category1 = moddict['wb_cat_ctgc'].transform(merge['general_cat'])
    moddict['wb_cat_ctsc1'] = CountVectorizer()
    moddict['wb_cat_ctsc1'].fit(merge['subcat_1'])
    X_category2 = moddict['wb_cat_ctsc1'].transform(merge['subcat_1'])
    moddict['wb_cat_ctsc2'] = CountVectorizer()
    moddict['wb_cat_ctsc2'].fit(merge['subcat_2'])
    X_category3 = moddict['wb_cat_ctsc2'].transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

    moddict['wb_dscr'] = wordbatch.WordBatch(normalize_text,
                                             extractor=(WordBag, {
                                                 "hash_ngrams": 2,
                                                 "hash_ngrams_weights": [1.0, 0.6],
                                                 "hash_size": 2**28,
                                                 "norm": None,
                                                 "tf": 'binary',
                                                 "idf": None
                                             }),
                                             procs=8)
    moddict['wb_dscr'].dictionary_freeze = True
    X_description = moddict['wb_dscr'].fit_transform(merge['name'] + ' ' +
                                                     merge['item_description'])
    #moddict['wb_dscr_mask'] = np.array(np.clip(X_description[:nrow_train].getnnz(axis=0) - 1, 0, 1), dtype=bool)
    moddict['wb_dscr_mask'] = np.where(X_description[:nrow_train].getnnz(axis=0) > 1)[0]
    X_description = X_description[:, moddict['wb_dscr_mask']]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

    moddict['wb_brandname'] = LabelBinarizer(sparse_output=True)
    moddict['wb_brandname'].fit(merge['brand_name'][:nrow_train])
    X_brand = moddict['wb_brandname'].transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

    moddict['wb_itemcond'] = LabelBinarizer(sparse_output=True)
    moddict['wb_itemcond'].fit(merge['item_condition_id'][:nrow_train])
    X_itemcond = moddict['wb_itemcond'].transform(merge['item_condition_id'])
    print('[{}] Label binarize `item_condition_id` completed.'.format(time.time() - start_time))

    moddict['wb_shipping'] = LabelBinarizer(sparse_output=True)
    moddict['wb_shipping'].fit(merge['shipping'][:nrow_train])
    X_shipping = moddict['wb_shipping'].transform(merge['shipping'])
    print('[{}] Label binarize `shipping` completed.'.format(time.time() - start_time))

    print(X_itemcond.shape, X_shipping.shape,
          #X_dummies.shape,
          X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape)
    sparse_merge = hstack((X_itemcond, X_shipping,
                           #X_dummies,
                           X_description, X_brand,
                           X_category1, X_category2, X_category3,
                           X_name, X_cat, x_col)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))
    print(50 * '-')
    cpuStats()
    print(50 * '-')
    # Remove features with document frequency <= 1 (note: the mask step itself
    # is absent in this variant; the comment is a leftover)
    print(sparse_merge.shape)
    gc.collect()
    sparse_merge, y = sparse_merge[:nrow_train], y[:nrow_train]
    if develop:
        train_X, valid_X, train_y, valid_y = sparse_merge[trnidx], \
            sparse_merge[validx], y.values[trnidx], y.values[validx]
    del sparse_merge
    gc.collect()
    print(50 * '*')
    cpuStats()
    print(50 * '*')
    print(train_X.shape[1])

    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                    D=train_X.shape[1], alpha_fm=0.01, L2_fm=0.0,
                    init_fm=0.01, D_fm=200, e_noise=0.0001, iters=1,
                    inv_link="identity", threads=4)  # iters=15
    print(50 * '|')
    cpuStats()
    print(50 * '|')

    baseline = 1.
    for i in range(15):
        print(50 * '-')
        cpuStats()
        print(50 * '-')
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline - 0.0004:
            baseline = score_
        else:
            break

    moddict['FMmodel'] = model
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        predsfm = moddict['FMmodel'].predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(predsfm)))
    gc.collect()
    return merge, moddict, trnidx, validx, nrow_train, predsfm
class FMFtrlModel(object):
    # D and batchsize are module-level globals defined with the WordBatch
    # setup shown earlier
    wb = wordbatch.WordBatch(None,
                             extractor=(WordHash, {"ngram_range": (1, 1),
                                                   "analyzer": "word",
                                                   "lowercase": False,
                                                   "n_features": D,
                                                   "norm": None,
                                                   "binary": True}),
                             minibatch_size=batchsize // 80,
                             procs=8, freeze=True, timeout=1800, verbose=0)

    def __init__(self, config):
        self.config = config
        self._build()

    def _build(self):
        D_fm = self.config['D_fm']
        iters = self.config['iters']
        e_clip = self.config['e_clip']
        alpha_fm = self.config['alpha_fm']
        weight_fm = self.config['weight_fm']
        threads = 8
        clf = FM_FTRL(alpha=0.05, beta=0.1, L1=0.0, L2=0.0, D=D,
                      alpha_fm=alpha_fm, L2_fm=0.0, init_fm=0.01,
                      weight_fm=weight_fm, D_fm=D_fm, e_noise=0.0,
                      iters=iters, inv_link="sigmoid", e_clip=e_clip,
                      threads=threads, use_avx=1, verbose=0)
        self.model = clf

    def fit(self, data, y, validate=True, weight=None):
        total_data = len(data)
        p = None
        X = None
        rcount = 0
        start_time = time.time()
        #cpuStats()
        step = 200000
        epochs = int(total_data / step) + 1
        for epoch in range(epochs):
            start = epoch * step
            end = start + step
            if start >= total_data:
                break
            if end > total_data:
                end = total_data
            str_array = df2csr(data[start:end])
            labels = y[start:end]
            if weight is not None:
                W = weight[start:end]
            else:
                W = None
            if p is not None:
                p.join()
            if X is not None:
                del X
                gc.collect()
            X = self.wb.transform(str_array)
            del str_array
            gc.collect()
            rcount += step
            if rcount % (2 * step) == 0:
                if p is not None:
                    p.join()
                p = threading.Thread(target=evaluate_batch,
                                     args=(self.model, X, labels, rcount))
                p.start()
            print("Training", rcount, time.time() - start_time)
            if p is not None:
                p.join()
            p = threading.Thread(target=fit_batch,
                                 args=(self.model, X, labels, W))
            p.start()
        if p is not None:
            p.join()
        del X
        gc.collect()

    def predict(self, X_train, weight=None):
        p = None
        test_preds = []
        click_ids = []
        str_array = df2csr(X_train)
        del X_train
        gc.collect()
        X = self.wb.transform(str_array)
        del str_array
        gc.collect()
        p = ThreadWithReturnValue(target=predict_batch, args=(self.model, X))
        p.start()
        if p is not None:
            test_preds += list(p.join())
        del X
        gc.collect()
        return test_preds
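# df2csr, fit_batch, evaluate_batch, and predict_batch are external helpers
# this class assumes. As an illustration only: the WordHash extractor above
# wants one whitespace-delimited token string per row, so a hypothetical
# minimal df2csr could look like this (not the author's implementation):
import numpy as np

def df2csr(df):
    # Hypothetical: join each row's values into one space-separated string,
    # prefixing each token with its column name so columns cannot collide.
    cols = list(df.columns)
    return np.array([' '.join('{}_{}'.format(c, v) for c, v in zip(cols, row))
                     for row in df.itertuples(index=False)])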
# Print the number of news articles
print("Done, number of news articles....")
print(corpus.shape)

# Compute TF-IDF
n_docs = len(corpus['Cuerpo'].tolist())
n_cpu = 2
batch_size = int(n_docs / n_cpu)
_n_words = 500
extractor = (WordBag, {"hash_ngrams": 1, "hash_ngrams_weights": [1.0, 1.0],
                       "hash_size": 2**22, "norm": "l2", "tf": 1.0, "idf": 1.0})
wb = wordbatch.WordBatch(normalize_text,
                         extractor=extractor,
                         procs=n_cpu,
                         minibatch_size=batch_size)
#WORBBAG_ITEM_DESC_PARAMS = {'hash_ngrams': 2, 'hash_ngrams_weights': [1.0, 1.0],
#                            'hash_size': 2 ** 26, 'norm': 'l2', 'tf': 1.0, 'idf': None}
#wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, WORBBAG_ITEM_DESC_PARAMS),
#                         procs=n_cpu)
#wb = wordbatch.WordBatch(normalize_text,
#                         extractor=extractor, procs=n_cpu)
#                         procs=n_cpu, n_words=500, minibatch_size=batch_size)
#wb.use_sc = True
wb.dictionary_freeze = True
# b = Batcher(procs=n_cpu, minibatch_size=batch_size, use_sc=True)
# lista = pd.DataFrame([corpus['Cuerpo'].tolist()])
def Split_Train_Test_FTRL(merge: pd.DataFrame, hand_feature, start_time):
    desc_w1 = param_space_best_WordBatch['desc_w1']
    desc_w2 = param_space_best_WordBatch['desc_w2']
    name_w1 = param_space_best_WordBatch['name_w1']
    name_w2 = param_space_best_WordBatch['name_w2']
    merge['brand_name'] = inductive_brand(merge[['brand_name', 'name']])

    wb = wordbatch.WordBatch(normalize_text=None,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [name_w1, name_w2],
                                 "hash_size": 2**28,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name']).astype(np.float32)
    del wb
    merge.drop(['name'], axis=1, inplace=True)
    X_name = X_name[:, np.array(np.clip(X_name[:TRAIN_SIZE].getnnz(axis=0) - 1, 0, 1),
                                dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb = wordbatch.WordBatch(normalize_text=None,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [desc_w1, desc_w2],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description_train = wb.fit_transform(
        merge['item_description'][:TRAIN_SIZE]).astype(np.float32)
    mask = np.array(np.clip(X_description_train.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    X_description_train = X_description_train[:, mask]
    print('X_description_train done')

    valid_len = merge.shape[0] - TRAIN_SIZE
    valid_len1, valid_len2 = int(valid_len / 3), int(valid_len * 2 / 3)
    # transform(), not fit_transform(): the dictionary is frozen and the
    # held-out chunks should not refit the vectorizer
    X_description_test1 = wb.transform(
        merge['item_description'][TRAIN_SIZE:TRAIN_SIZE + valid_len1]).astype(np.float32)
    X_description_test1 = X_description_test1[:, mask]
    print('X_description_test1 done')
    X_description_test2 = wb.transform(
        merge['item_description'][TRAIN_SIZE + valid_len1:TRAIN_SIZE + valid_len2]).astype(np.float32)
    X_description_test2 = X_description_test2[:, mask]
    print('X_description_test2 done')
    X_description_test3 = wb.transform(
        merge['item_description'][TRAIN_SIZE + valid_len2:]).astype(np.float32)
    X_description_test3 = X_description_test3[:, mask]
    print('X_description_test3 done')
    del wb, mask
    merge.drop(['item_description'], axis=1, inplace=True)
    print(X_description_train.shape, X_description_test1.shape,
          X_description_test2.shape, X_description_test3.shape)
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

    X_category1, X_category2, X_category3, X_brand = Get_Vectorizor(merge)
    merge.drop(['category_1', 'category_2', 'category_name', 'brand_name'],
               axis=1, inplace=True)
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values.astype(np.float32))
    merge.drop(['item_condition_id', 'shipping'], axis=1, inplace=True)
    X_hand_feature = merge[hand_feature].values.astype(np.float32)
    merge.drop(hand_feature, axis=1, inplace=True)
    print('-' * 50)

    # coo_matrix
    X_train = hstack((X_dummies[:TRAIN_SIZE], X_brand[:TRAIN_SIZE],
                      X_category1[:TRAIN_SIZE], X_category2[:TRAIN_SIZE],
                      X_category3[:TRAIN_SIZE], X_hand_feature[:TRAIN_SIZE],
                      X_name[:TRAIN_SIZE], X_description_train),
                     dtype=np.float32)
    print(X_description_train.shape)
    X_description_train = None
    gc.collect()
    print('-' * 50)
    X_test1 = hstack((X_dummies[TRAIN_SIZE:TRAIN_SIZE + valid_len1],
                      X_brand[TRAIN_SIZE:TRAIN_SIZE + valid_len1],
                      X_category1[TRAIN_SIZE:TRAIN_SIZE + valid_len1],
                      X_category2[TRAIN_SIZE:TRAIN_SIZE + valid_len1],
                      X_category3[TRAIN_SIZE:TRAIN_SIZE + valid_len1],
                      X_hand_feature[TRAIN_SIZE:TRAIN_SIZE + valid_len1],
                      X_name[TRAIN_SIZE:TRAIN_SIZE + valid_len1],
                      X_description_test1),
                     dtype=np.float32)
    X_description_test1 = None
    gc.collect()
    print('-' * 50)
    X_test2 = hstack((X_dummies[TRAIN_SIZE + valid_len1:TRAIN_SIZE + valid_len2],
                      X_brand[TRAIN_SIZE + valid_len1:TRAIN_SIZE + valid_len2],
                      X_category1[TRAIN_SIZE + valid_len1:TRAIN_SIZE + valid_len2],
                      X_category2[TRAIN_SIZE + valid_len1:TRAIN_SIZE + valid_len2],
                      X_category3[TRAIN_SIZE + valid_len1:TRAIN_SIZE + valid_len2],
                      X_hand_feature[TRAIN_SIZE + valid_len1:TRAIN_SIZE + valid_len2],
                      X_name[TRAIN_SIZE + valid_len1:TRAIN_SIZE + valid_len2],
                      X_description_test2),
                     dtype=np.float32)
    X_description_test2 = None
    gc.collect()
    print('-' * 50)
    X_test3 = hstack((X_dummies[TRAIN_SIZE + valid_len2:],
                      X_brand[TRAIN_SIZE + valid_len2:],
                      X_category1[TRAIN_SIZE + valid_len2:],
                      X_category2[TRAIN_SIZE + valid_len2:],
                      X_category3[TRAIN_SIZE + valid_len2:],
                      X_hand_feature[TRAIN_SIZE + valid_len2:],
                      X_name[TRAIN_SIZE + valid_len2:],
                      X_description_test3),
                     dtype=np.float32)
    X_description_test3 = None
    gc.collect()
    print(X_dummies.shape, X_brand.shape, X_category1.shape, X_category2.shape,
          X_category3.shape, X_hand_feature.shape, X_name.shape,
          X_train.shape, X_test1.shape, X_test2.shape, X_test3.shape)
    X_dummies, X_brand, X_category1, X_category2, X_category3, \
        X_hand_feature, X_name = None, None, None, None, None, None, None
    gc.collect()

    # csr_matrix
    X_train = X_train.tocsr()
    print('[{}] X_train completed.'.format(time.time() - start_time))
    X_test1 = X_test1.tocsr()
    print('[{}] X_test1 completed.'.format(time.time() - start_time))
    X_test2 = X_test2.tocsr()
    print('[{}] X_test2 completed.'.format(time.time() - start_time))
    X_test3 = X_test3.tocsr()
    print('[{}] X_test3 completed.'.format(time.time() - start_time))
    return X_train, X_test1, X_test2, X_test3
def getFMFTRL():
    #os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')
    glove_file = '../feat/glove.6B.50d.txt'
    threads = 4
    save_dir = '../feat'
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]

    # Prepend the brand to the name where a brand exists (NaN != NaN filters
    # missing brands) and the brand is not already contained in the name
    ix = (merge['brand_name'] == merge['brand_name']) & \
         (~merge['brand_name'].str.lower().fillna('ZZZZZZ').isin(merge['name'].str.lower()))
    merge.loc[ix, 'name'] = merge.loc[ix, 'brand_name'] + ' ' + merge.loc[ix, 'name']

    # EXTRACT DEVELOPMENT TEST
    trnidx, validx = train_test_split(range(train.shape[0]),
                                      random_state=233, train_size=0.90)
    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    #merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))
    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))
    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))
    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))

    '''
    Crossed columns
    '''
    # my understanding of how to replicate what layers.crossed_column does;
    # see https://www.tensorflow.org/tutorials/linear
    def cross_columns(x_cols):
        """Simple helper to build the crossed columns in a pandas dataframe."""
        crossed_columns = dict()
        colnames = ['_'.join(x_c) for x_c in x_cols]
        for cname, x_c in zip(colnames, x_cols):
            crossed_columns[cname] = x_c
        return crossed_columns

    merge['item_condition_id_str'] = merge['item_condition_id'].astype(str)
    merge['shipping_str'] = merge['shipping'].astype(str)
    x_cols = (
        ['brand_name', 'item_condition_id_str'],
        ['brand_name', 'subcat_1'],
        ['brand_name', 'subcat_2'],
        ['brand_name', 'general_cat'],
        #['brand_name', 'subcat_1', 'item_condition_id_str'],
        #['brand_name', 'subcat_2', 'item_condition_id_str'],
        #['brand_name', 'general_cat', 'item_condition_id_str'],
        ['brand_name', 'shipping_str'],
        ['shipping_str', 'item_condition_id_str'],
        ['shipping_str', 'subcat_2'],
        ['item_condition_id_str', 'subcat_2'])
    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(merge.select_dtypes(include=['object']).columns)

    D = 2**30
    for k, v in crossed_columns_d.items():
        print('Crossed column ', k)
        outls_ = []
        indicator = 0
        for col in v:
            outls_.append((np.array(merge[col].apply(hash))) % D + indicator)
            indicator += 10**6
        merge[k] = sum(outls_).tolist()

    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    lb = LabelBinarizer(sparse_output=True)
    x_col = lb.fit_transform(merge[cross_nm[0]])
    for i in range(1, len(cross_nm)):
        x_col = hstack((x_col, lb.fit_transform(merge[cross_nm[i]])))
    del lb

    '''
    Hash name
    '''
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del wb
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    '''
    Hash category
    '''
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**20,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    cat = merge["category_name"].str.replace('/', ' ')
    X_cat = wb.fit_transform(cat)
    del wb
    X_cat = X_cat[:, np.array(np.clip(X_cat.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `category` completed.'.format(time.time() - start_time))

    '''
    Count category
    '''
    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**29,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del wb
    X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1),
                                              dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']], sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(
        time.time() - start_time))

    print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape,
          X_category2.shape, X_category3.shape, X_name.shape, X_cat.shape, x_col.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name, X_cat, x_col)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))

    # Remove features with document frequency <= 1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    print(sparse_merge.shape)
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    mask = np.array(np.clip(X.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    X = X[:, mask]
    X_test = X_test[:, mask]
    print(X.shape)
    gc.collect()

    if develop:
        #train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233)
        train_X, valid_X, train_y, valid_y = \
            X[trnidx], X[validx], y.values[trnidx], y.values[validx]
    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                    D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0,
                    init_fm=0.01, D_fm=200, e_noise=0.0001, iters=1,
                    inv_link="identity", threads=threads)  # iters=15

    baseline = 1.
    for i in range(15):
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline:
            baseline = score_
        else:
            break

    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        predsfm = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(predsfm)))
        # 0.44532
    # Full data 0.424681
    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

    return merge, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
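# To make the hashed crossing above concrete: a standalone sketch of
# cross_columns plus the hashing loop on toy data. Column names and values
# here are illustrative, not taken from the source.
import numpy as np
import pandas as pd

D = 2**30  # hash space, matching the snippet
df = pd.DataFrame({'brand_name': ['acme', 'acme', 'zco'],
                   'shipping_str': ['0', '1', '1']})
x_cols = (['brand_name', 'shipping_str'],)
crossed_columns_d = {'_'.join(x_c): x_c for x_c in x_cols}

for k, v in crossed_columns_d.items():
    outls_, indicator = [], 0
    for col in v:
        # offset each component so the same token hashes differently per slot
        outls_.append((np.array(df[col].apply(hash))) % D + indicator)
        indicator += 10**6
    df[k] = sum(outls_).tolist()

# Note: Python 3 salts str hashes per process unless PYTHONHASHSEED is fixed,
# so these crossed IDs are only stable within a single run.
print(df['brand_name_shipping_str'])  # one hashed ID per crossed combination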
# normalize_text= default_normalize_text, spellcor_count=0, spellcor_dist= 2, n_words= 10000000,
# min_df= 0, max_df= 1.0, raw_min_df= -1, procs= 0, verbose= 1, minibatch_size= 20000,
# note: `non_negative` was removed from HashingVectorizer in newer
# scikit-learn releases (superseded by `alternate_sign`)
vectorizer = HashingVectorizer(preprocessor=normalize_text,
                               decode_error='ignore',
                               n_features=2**23,
                               non_negative=False,
                               ngram_range=(1, 2),
                               norm='l2')
start = time.time()
X = vectorizer.fit_transform(df['text_normalized'])
print("Process time: {}".format(time.time() - start))
print(X.shape)

start = time.time()
wb = wordbatch.WordBatch(normalize_text,
                         extractor=(WordHash, {
                             "decode_error": 'ignore',
                             "n_features": 2**23,
                             "non_negative": False,
                             "ngram_range": (1, 2),
                             "norm": 'l2'
                         }),
                         procs=8
                         #, method="serial"
                         )
Xwb = wb.fit_transform(df['text_normalized'].values)
print("Process time: {}".format(time.time() - start))
print(Xwb.shape)
def wordbatch_algo():
    import time
    # print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    train = pd.read_table('../input/train.tsv', engine='c')
    # Drop rows where price = 0
    train = train[train.price != 0].reset_index(drop=True)
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    y = np.log1p(train["price"])
    nrow_train = train.shape[0]

    # Training
    train['general_cat'], train['subcat_1'], train['subcat_2'] = \
        zip(*train['category_name'].apply(lambda x: split_cat(x)))
    train.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))
    handle_missing_inplace(train)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))
    cutting(train)
    print('[{}] Cut completed.'.format(time.time() - start_time))
    to_categorical(train)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))

    # Add some new features (.values replaces the removed pandas .as_matrix()):
    X_len_desc = train['item_description'].apply(lambda x: len(x)).values.reshape(-1, 1)
    X_len_name = train['name'].apply(lambda x: len(x)).values.reshape(-1, 1)
    print('[{}] Length of text completed.'.format(time.time() - start_time))

    # Name
    wb_name = wordbatch.WordBatch(normalize_text,
                                  extractor=(WordBag, {
                                      "hash_ngrams": 2,
                                      "hash_ngrams_weights": [1.5, 1.0],
                                      "hash_size": 2**29,
                                      "norm": None,
                                      "tf": 'binary',
                                      "idf": None,
                                  }),
                                  procs=8)
    wb_name.dictionary_freeze = True
    wb_name.fit(train['name'])
    X_name = wb_name.transform(train['name'])
    # X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb_cat1 = CountVectorizer()
    wb_cat2 = CountVectorizer()
    wb_cat3 = CountVectorizer()
    wb_cat1.fit(train['general_cat'])
    wb_cat2.fit(train['subcat_1'])
    wb_cat3.fit(train['subcat_2'])
    X_category1 = wb_cat1.transform(train['general_cat'])
    X_category2 = wb_cat2.transform(train['subcat_1'])
    X_category3 = wb_cat3.transform(train['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb_desc = wordbatch.WordBatch(normalize_text,
                                  extractor=(WordBag, {
                                      "hash_ngrams": 2,
                                      "hash_ngrams_weights": [1.0, 1.0],
                                      "hash_size": 2**28,
                                      "norm": "l2",
                                      "tf": 1.0,
                                      "idf": None
                                  }),
                                  procs=8)
    wb_desc.dictionary_freeze = True
    wb_desc.fit(train['item_description'])
    X_description = wb_desc.transform(train['item_description'])
    # X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

    lb = LabelBinarizer(sparse_output=True)
    lb.fit(train['brand_name'])
    X_brand = lb.transform(train['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

    X_cond, d_cond = fit_dummy(train['item_condition_id'].tolist())
    X_ship, d_ship = fit_dummy(train['shipping'].tolist())
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(
        time.time() - start_time))

    del train
    gc.collect()
    print(X_cond.shape, X_ship.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape, X_name.shape)
    sparse_merge = hstack((X_cond, X_ship, X_description, X_brand,
                           X_category1, X_category2, X_category3, X_name)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))
    del X_description, X_brand, X_category1, X_category2, X_category3, X_name
    gc.collect()

    # Remove features with document frequency <= 1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    print(sparse_merge.shape)
    X = sparse_merge

    # ---------------------------------------
    # FM model fit
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(
            X, y, test_size=TEST_SIZE, random_state=SPLIT_SEED)
    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                    D=train_X.shape[1], alpha_fm=0.01, L2_fm=0.0,
                    init_fm=0.01, D_fm=200, e_noise=0.0001, iters=FM_iter,
                    inv_link="identity", threads=4)
    model.fit(train_X, train_y)
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    print('-' * 20)
    if develop:
        preds = model.predict(X=valid_X)
        print("->>>> FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    # ---------------------------------------
    # FTRL model fit
    model2 = FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=1.0,
                  D=train_X.shape[1], iters=FTRL_iter,
                  inv_link="identity", threads=1)
    # del X; gc.collect()
    model2.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model2.predict(X=valid_X)
        print("->>>> FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    # Clear variables:
    del X, train_X, train_y, sparse_merge
    gc.collect()

    # ---------------------------------------
    # Testing by chunk
    print(' FM/FTRL: ...reading the test data...')
    predsFM = []
    predsF = []
    for test in load_test():
        test['general_cat'], test['subcat_1'], test['subcat_2'] = \
            zip(*test['category_name'].apply(lambda x: split_cat(x)))
        test.drop('category_name', axis=1, inplace=True)
        handle_missing_inplace(test)
        #print('[{}] Handle missing completed.'.format(time.time() - start_time))
        cutting(test)
        # print('[{}] Cut completed.'.format(time.time() - start_time))
        to_categorical(test)
        # print('[{}] Convert categorical completed'.format(time.time() - start_time))

        # Add some new features:
        X_len_desc_test = test['item_description'].apply(lambda x: len(x)).values.reshape(-1, 1)
        X_len_name_test = test['name'].apply(lambda x: len(x)).values.reshape(-1, 1)
        X_name_test = wb_name.transform(test['name'])
        # X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
        X_category1_test = wb_cat1.transform(test['general_cat'])
        X_category2_test = wb_cat2.transform(test['subcat_1'])
        X_category3_test = wb_cat3.transform(test['subcat_2'])
        X_description_test = wb_desc.transform(test['item_description'])
        # X_description_test = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
        X_brand_test = lb.transform(test['brand_name'])
        X_cond_test = transform_dummy(test['item_condition_id'].tolist(), d_cond)
        X_ship_test = transform_dummy(test['shipping'].tolist(), d_ship)
        X_test = hstack((X_cond_test, X_ship_test, X_description_test, X_brand_test,
                         X_category1_test, X_category2_test, X_category3_test,
                         X_name_test)).tocsr()
        X_test = X_test[:, mask]

        # Clear variables:
        del X_cond_test, X_ship_test, X_description_test, X_brand_test, \
            X_category1_test, X_category2_test, X_category3_test, X_name_test
        del test
        gc.collect()

        predsFM_batch = model.predict(X_test)
        predsFM += np.array(predsFM_batch).flatten().tolist()
        predsF_batch = model2.predict(X_test)
        predsF += np.array(predsF_batch).flatten().tolist()

    print(np.array(predsFM))
    print('-' * 20)
    print(np.array(predsF))
    print('-' * 20)
    return np.array(predsFM), np.array(predsF)
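# load_test() is consumed above as a generator of test-set chunks but is not
# defined in the snippet. A plausible implementation, stated as an assumption
# (the path and chunk size are hypothetical):
import pandas as pd

def load_test(path='../input/test.tsv', chunksize=700000):
    # Yield the test set in DataFrame chunks so the full feature matrix
    # never has to be held in memory at prediction time.
    for chunk in pd.read_table(path, engine='c', chunksize=chunksize):
        yield chunk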
def main():
    start_time = time.time()
    from time import gmtime, strftime
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    # if 1 == 1:
    ###train = pd.read_table('../input/mercari-price-suggestion-challenge/train.tsv', engine='c')
    ###test = pd.read_table('../input/mercari-price-suggestion-challenge/test.tsv', engine='c')
    train = pd.read_table('../input/train.tsv', engine='c')
    test = pd.read_table('../input/test.tsv', engine='c')
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    submission = test[['test_id']]
    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))
    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))
    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))
    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del wb
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del wb
    X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1),
                                              dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']], sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(
        time.time() - start_time))
    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape, X_name.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name)).tocsr()
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))
    # pd.to_pickle((sparse_merge, y), "xy.pkl")
    # else:
    #     nrow_train, nrow_test = 1481661, 1482535
    #     sparse_merge, y = pd.read_pickle("xy.pkl")

    # Remove features with document frequency <= 1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)
    gc.collect()

    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X, y,
                                                              test_size=0.05,
                                                              random_state=100)

    model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0,
                 D=sparse_merge.shape[1], iters=50,
                 inv_link="identity", threads=1)
    model.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))
    predsF = model.predict(X_test)
    print('[{}] Predict FTRL completed'.format(time.time() - start_time))

    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                    D=sparse_merge.shape[1], alpha_fm=0.01, L2_fm=0.0,
                    init_fm=0.01, D_fm=200, e_noise=0.0001, iters=15,
                    inv_link="identity", threads=4)
    model.fit(train_X, train_y)
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))
    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

    params = {
        'learning_rate': 0.6,
        'application': 'regression',
        'max_depth': 4,
        'num_leaves': 31,
        'verbosity': -1,
        'metric': 'RMSE',
        'data_random_seed': 1,
        'bagging_fraction': 0.6,
        'bagging_freq': 5,
        'feature_fraction': 0.6,
        'nthread': 4,
        'min_data_in_leaf': 100,
        'max_bin': 31
    }

    # Remove features with document frequency <= 100
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 100, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X, y,
                                                              test_size=0.05,
                                                              random_state=100)
    d_train = lgb.Dataset(train_X, label=train_y)
    watchlist = [d_train]
    if develop:
        d_valid = lgb.Dataset(valid_X, label=valid_y)
        watchlist = [d_train, d_valid]
    model = lgb.train(params, train_set=d_train, num_boost_round=6000,
                      valid_sets=watchlist, early_stopping_rounds=1000,
                      verbose_eval=1000)
    if develop:
        preds = model.predict(valid_X)
        print("LGB dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))
    predsL = model.predict(X_test)
    print('[{}] Predict LGB completed.'.format(time.time() - start_time))

    preds = (predsF * 0.2 + predsL * 0.3 + predsFM * 0.5)
    submission['price'] = np.expm1(preds)
    submission.to_csv("submission_wordbatch_ftrl_fm_lgb.csv", index=False)
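# The np.clip(getnnz - 1, 0, 1) mask recurs in nearly every snippet above.
# A small helper, written here as a sketch with an illustrative min_df
# parameter, makes the intent explicit:
import numpy as np
from scipy.sparse import csr_matrix

def df_mask(X, min_df=1):
    # Boolean mask over columns: keep features whose document frequency
    # (number of rows with a nonzero entry) exceeds min_df.
    return np.array(np.clip(X.getnnz(axis=0) - min_df, 0, 1), dtype=bool)

X = csr_matrix(np.array([[1.0, 0.0, 2.0],
                         [0.0, 0.0, 3.0],
                         [4.0, 0.0, 0.0]]))
print(X[:, df_mask(X)].shape)  # (3, 2): the middle, all-zero column is dropped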