Code Example #1
	def __init__(self, pickle_model="", datadir=None):
		from pyspark import SparkContext
		self.sc= SparkContext()
		self.wordbatch = wordbatch.WordBatch(normalize_text, backend="spark", backend_handle=self.sc,
		                                     extractor=(WordBag, {"hash_ngrams":3,
		                                                          "hash_ngrams_weights":[-1.0, -1.0, 1.0],
		                                                          "hash_size":2**23, "norm":'l2',
		                                                          "tf":'binary', "idf":50.0}))
		self.clf= FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 23, iters=1, inv_link="identity")
		if datadir==None:  (self.wordbatch, self.clf)= pkl.load(gzip.open(pickle_model, 'rb'))
		else: self.train(datadir, pickle_model)
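
# The excerpt above only shows the constructor; prediction with the pickled
# (WordBatch, FTRL) pair is not part of it. A minimal sketch of how the two
# objects are typically wired together (the helper below is an assumption,
# not this project's code):
import gzip
import pickle as pkl

def predict_texts(pickle_model, texts):
    # Restore the pickled pair and score raw texts with it.
    wb, clf = pkl.load(gzip.open(pickle_model, 'rb'))
    X = wb.transform(texts)   # hash texts into the same 2**23-dimensional feature space
    return clf.predict(X)     # FTRL returns one score per row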
Code Example #2
    def __init__(self, pickle_model="", datadir=None):
        self.maxlen = 100
        self.n_words = 100000
        parser = NeonArgparser(__doc__)
        self.args = parser.parse_args()
        self.args.batch_size = self.batch_size = 2048
        self.args.deterministic = None
        self.args.rng_seed = 0
        print(extract_valid_args(self.args, gen_backend))
        self.be = gen_backend(**extract_valid_args(self.args, gen_backend))

        embedding_dim = 100
        init_emb = Uniform(-0.1 / embedding_dim, 0.1 / embedding_dim)
        init_glorot = GlorotUniform()
        self.layers = [
            LookupTable(vocab_size=self.n_words,
                        embedding_dim=embedding_dim,
                        init=init_emb,
                        pad_idx=0,
                        update=True,
                        name="LookupTable"),
            Dropout(keep=0.5),
            BiLSTM(100,
                   init=init_glorot,
                   activation=Tanh(),
                   gate_activation=Logistic(),
                   reset_cells=True,
                   split_inputs=False,
                   name="BiLSTM"),
            RecurrentMean(),
            Affine(1,
                   init_glorot,
                   bias=init_glorot,
                   activation=Identity(),
                   name="Affine")
        ]

        self.wordbatch = wordbatch.WordBatch(normalize_text,
                                             n_words=self.n_words,
                                             extractors=[(wordbatch.WordSeq, {
                                                 "seq_maxlen":
                                                 self.maxlen
                                             })])

        if datadir == None:
            self.model = Model(self.layers)
            self.model.load_params(pickle_model)
            self.wordbatch = pkl.load(gzip.open(pickle_model + ".wb", 'rb'))
        else:
            self.train(datadir, pickle_model)
Code Example #3
 def __init__(self, pickle_model="", datadir=None):
     self.wb = wordbatch.WordBatch(normalize_text,
                                   extractor=(WordHash, {
                                       "decode_error": 'ignore',
                                       "n_features": 2**25,
                                       "non_negative": False,
                                       "ngram_range": (1, 2),
                                       "norm": 'l2'
                                   }))
     self.clf = Ridge(alpha=1.0, random_state=0)
     if datadir == None:
         (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
     else:
         self.train(datadir, pickle_model)
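
# The train() method referenced above is not included in the excerpt. A rough
# sketch of fitting such a (WordBatch, Ridge) pair (texts, labels and the
# pickling step are illustrative assumptions, not the project's actual code):
import gzip
import pickle as pkl

def train_texts(wb, clf, texts, labels, pickle_model):
    X = wb.fit_transform(texts)   # hash the raw texts into sparse features
    clf.fit(X, labels)            # fit the Ridge regressor on them
    with gzip.open(pickle_model, 'wb') as f:
        pkl.dump((wb, clf), f)    # persist both objects, matching the loader in __init__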
Code Example #4
 def __init__(self, pickle_model="", datadir=None):
     self.wb = wordbatch.WordBatch(normalize_text,
                                   stemmer=stemmer,
                                   extractor=(WordHash, {
                                       "decode_error": 'ignore',
                                       "n_features": 2**25,
                                       "non_negative": False,
                                       "ngram_range": (1, 2),
                                       "norm": 'l2'
                                   }))
     self.clf = FM_FTRL(D=2**25,
                        D_fm=4,
                        iters=1,
                        inv_link="identity",
                        threads=multiprocessing.cpu_count() // 2)
     if datadir == None:
         (self.wb, self.clf) = pkl.load(gzip.open(pickle_model, 'rb'))
     else:
         self.train(datadir, pickle_model)
Code Example #5
    def __init__(self, pickle_model="", datadir=None):
        seed = 10002
        session_conf = tf.ConfigProto(
            intra_op_parallelism_threads=multiprocessing.cpu_count() // 2,
            inter_op_parallelism_threads=1)
        os.environ['PYTHONHASHSEED'] = str(seed)
        np.random.seed(seed + 1)
        random.seed(seed + 2)
        tf.set_random_seed(seed + 3)
        K.set_session(
            tf.Session(graph=tf.get_default_graph(), config=session_conf))

        self.maxlen = 200
        self.max_words = 20000
        self.wb = wordbatch.WordBatch(normalize_text,
                                      max_words=self.max_words,
                                      extractor=(WordSeq, {
                                          "seq_maxlen": self.maxlen
                                      }))
        self.model = Sequential()
        self.model.add(
            Embedding(self.max_words + 2, 20, input_length=self.maxlen))

        self.model.add(
            Conv1D(activation="relu",
                   padding="same",
                   strides=1,
                   filters=10,
                   kernel_size=3))
        self.model.add(Dropout(0.5))
        self.model.add(BatchNormalization())
        self.model.add(GlobalMaxPooling1D())
        self.model.add(Dense(1))
        self.model.compile(loss='mean_squared_error',
                           optimizer='adam',
                           metrics=['mean_squared_error'])
        if datadir == None:
            self.model = load_model(pickle_model)
            self.wb = pkl.load(gzip.open(pickle_model + ".wb", 'rb'))
        else:
            self.train(datadir, pickle_model)
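
# Example #5's train() method is likewise not shown. Assuming WordSeq yields one
# fixed-length (maxlen) integer sequence per document, feeding the result to the
# Keras model would look roughly like this (batch size and epochs are made up):
import numpy as np

def fit_on_texts(wb, model, texts, targets):
    seqs = np.array(wb.fit_transform(texts))   # (n_samples, maxlen) matrix of word indices
    model.fit(seqs, np.asarray(targets), batch_size=128, epochs=2, verbose=1)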
Code Example #6
def normalize_text(text):
    text = text.lower()
    text = nums_re.sub(" NUM ", text)
    text = " ".join([
        word for word in non_alphanums.sub(" ", text).strip().split()
        if len(word) > 1
    ])
    return text
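
# normalize_text above relies on two module-level regexes that the excerpt does
# not show; plausible definitions (assumptions, not the original source) are:
import re

non_alphanums = re.compile(r"[\W]")   # anything that is not a word character
nums_re = re.compile(r"[0-9]+")       # digit runs, replaced by " NUM "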


maxlen = 200
max_words = 20000
wb = wordbatch.WordBatch(normalize_text,
                         max_words=max_words,
                         extractor=(WordSeq, {
                             "seq_maxlen": maxlen
                         }))
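
# The wb just built is not applied to data in this excerpt (and is reassigned
# below). A minimal use of the WordSeq configuration, with made-up sentences
# (the exact padding behaviour is an assumption):
docs = ["This is sample sentence number 1", "another sample sentence"]
seqs = wb.fit_transform(docs)   # one length-maxlen list of word indices per document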

wb = wordbatch.WordBatch(
    normalize_text,
    extractor=(Hstack, [(WordVec, {
        "wordvec_file": "../../../data/word2vec/glove.twitter.27B.100d.txt.gz",
        "normalize_text": normalize_text,
        "encoding": "utf8"
    }),
                        (WordVec, {
                            "wordvec_file": "../../../data/word2vec/glove.6B.50d.txt.gz",
                            "normalize_text": normalize_text,
                            "encoding": "utf8"
                        })]))
Code Example #7
class WBFmFtrlModel(object):
    wb = wordbatch.WordBatch(None, extractor=(WordHash, {"ngram_range": (1, 1), "analyzer": "word",
                                                          "lowercase": False, "n_features": D,
                                                          "norm": None, "binary": True}),
                             minibatch_size=batchsize // 80, procs=8, freeze=True, timeout=1800, verbose=0)
    clf = FM_FTRL(alpha=0.05, beta=0.1, L1=0.0, L2=0.0, D=D, alpha_fm=0.02, L2_fm=0.0, init_fm=0.01, weight_fm=1.0,
                  D_fm=8, e_noise=0.0, iters=3, inv_link="sigmoid", e_clip=1.0, threads=4, use_avx=1, verbose=0)

    def __init__(self,train_files):
        self.train_files = train_files

    def predict(self,predict_file):
        p = None
        test_preds = []
        click_ids = []
        X = None
        for df_c in pd.read_csv(predict_file,engine='c',chunksize=batchsize,sep=","):
            str_array, labels, weights = df2csr(self.wb,df_c)
            click_ids+= df_c['click_id'].tolist()
            del(df_c)
            if p != None:
                test_preds += list(p.join())
                if X is not None:
                    del (X)
                    X = None
            gc.collect()
            X = self.wb.transform(str_array)
            del (str_array)
            p = ThreadWithReturnValue(target=predict_batch, args=(self.clf, X))
            p.start()

        if p != None:  test_preds += list(p.join())
        del(X)
        return click_ids, test_preds

    def train(self):
        p = None
        X = None
        rcount = 0
        for train_file in self.train_files:
            print("Train using file:{}".format(train_file))
            #for df_c in pd.read_csv('../input/train.csv', engine='c', chunksize=batchsize, sep=",", dtype=dtypes):
            for df_c in pd.read_csv(train_file, engine='c', chunksize=batchsize,
                                    sep=",", dtype=dtypes):
                rcount += len(df_c)
                #cpuStats()
                str_array, labels, weights= df2csr(self.wb, df_c, pick_hours={4, 5, 10, 13, 14})
                del(df_c)
                if p != None:
                    p.join()
                    if X is not None:
                        del(X)
                        X = None
                gc.collect()
                X= self.wb.transform(str_array)
                del(str_array)
                if rcount % (2 * batchsize) == 0:
                    if p != None:  p.join()
                    p = threading.Thread(target=evaluate_batch, args=(self.clf, X, labels, rcount))
                    p.start()
                print("Training", rcount, time.time() - start_time)
                cpuStats()
                if p != None:  p.join()
                p = threading.Thread(target=fit_batch, args=(self.clf, X, labels, weights))
                p.start()
                if p != None:  p.join()

                del(X)
                X = None
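
# predict() and train() above rely on a ThreadWithReturnValue helper that the
# excerpt does not define. A minimal sketch of such a wrapper (an assumption,
# not this project's implementation): a Thread whose join() returns the
# target's result.
import threading

class ThreadWithReturnValue(threading.Thread):
    def __init__(self, target=None, args=()):
        super().__init__()
        self._target_fn = target
        self._args = args
        self._result = None

    def run(self):
        if self._target_fn is not None:
            self._result = self._target_fn(*self._args)

    def join(self, timeout=None):
        super().join(timeout)
        return self._result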
Code Example #8
def main():
    feature_vectorized_file_name = 'Data/feature_vectorized2'
    if os.path.exists(feature_vectorized_file_name):
        sparse_merge, price = _load(feature_vectorized_file_name)
        print(sparse_merge.shape)
    else:
        ########################################################################
        start_time = time.time()
        merge, submission, price = get_extract_feature()
        merge = merge[:TRAIN_SIZE]

        #merge['item_condition_id'] = merge['item_condition_id'].astype('category')
        # print('[{}] Convert categorical completed'.format(time.time() - start_time))
        #
        # # vectorize features
        # wb = CountVectorizer()
        # X_category2 = wb.fit_transform(merge['category_2'])
        # X_category3 = wb.fit_transform(merge['category_name'])
        # X_brand2 = wb.fit_transform(merge['brand_name'])
        # print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))
        #
        # lb = LabelBinarizer(sparse_output=True)
        # X_brand = lb.fit_transform(merge['brand_name'])
        # X_category1 = lb.fit_transform(merge['category_1'])
        # X_category4 = lb.fit_transform(merge['category_name'])
        # print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))
        #
        # X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']], sparse=True).values)
        #
        # # hand feature
        # for col in merge.columns:
        #     if ('Len' in col) or ('Frec' in col):
        #         merge[col] = np.log1p(merge[col])
        #         merge[col] = merge[col] / merge[col].max()
        #
        # hand_feature = ['brand_name_Frec', 'item_description_wordLen', 'brand_name_name_Intsct',
        #                 'brand_name_item_description_Intsct']
        # X_hand_feature = merge[hand_feature].values
        #
        name_w1 = param_space_best_WordBatch['name_w1']
        name_w2 = param_space_best_WordBatch['name_w2']
        desc_w1 = param_space_best_WordBatch['desc_w1']
        desc_w2 = param_space_best_WordBatch['desc_w2']
        #
        # wb = wordbatch.WordBatch(normalize_text=None, extractor=(WordBag, {
        #     "hash_ngrams": 2,
        #     "hash_ngrams_weights": [name_w1, name_w2],
        #     "hash_size": 2 ** 28,
        #     "norm": None,
        #     "tf": 'binary',
        #     "idf": None,
        # }), procs=8)
        # wb.dictionary_freeze = True
        # X_name = wb.fit_transform(merge['name'])
        # del (wb)
        # X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 2, 0, 1), dtype=bool)]
        # print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))


        merge['item_description'] = merge['category_2'].map(str)+' E '+\
                                    merge['name'].map(str)+' E '+\
                                    merge['item_description'].map(str)

        wb = wordbatch.WordBatch(normalize_text=None,
                                 extractor=(WordBag, {"hash_ngrams": 3,
                                                      "hash_ngrams_weights": [desc_w1, desc_w2, 0.7],
                                                      "hash_size": 2**28,
                                                      "norm": "l2",
                                                      "tf": 1.0,
                                                      "idf": None}),
                                 procs=8)
        wb.dictionary_freeze = True
        X_description = wb.fit_transform(merge['item_description'])
        del (wb)
        X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 6, 0, 1), dtype=bool)]
        print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))
        print(X_description.shape)

        # Note: X_dummies, X_brand, X_brand2, X_category1..4, X_hand_feature and X_name
        # come from the vectorization steps commented out above; they must be re-enabled
        # before this hstack can run.
        sparse_merge = hstack((X_dummies, X_brand, X_brand2, X_category1,
                               X_category2, X_category3, X_category4,
                               X_hand_feature, X_name, X_description)).tocsr()

        print(X_dummies.shape, X_brand.shape, X_brand2.shape,
              X_category1.shape, X_category2.shape, X_category3.shape,
              X_category4.shape, X_hand_feature.shape, X_name.shape,
              X_description.shape, sparse_merge.shape)

        _save(feature_vectorized_file_name, [sparse_merge, price])
        print('[{}] data saved.'.format(time.time() - start_time))

    ########################################################################
    # use hyperopt to find the best parameters of the model
    # use 3 fold cross validation

    # learner_name='best_FTRL'
    # learner_name='FTRL'
    learner_name = 'best_FM_FTRL'
    #learner_name='FM_FTRL'
    print(learner_name)
    logname = "[Learner@%s]_hyperopt_%s.log" % (learner_name,
                                                time_utils._timestamp())
    logger = logging_utils._get_logger('Log', logname)
    logger.info('start')

    optimizer = TaskOptimizer(learner_name, sparse_merge, price, logger)
    optimizer.run()

    a = 12
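
# _load() and _save() used above are project helpers not shown in the excerpt.
# Gzip-pickle versions consistent with how they are called here (an assumption):
import gzip
import pickle as pkl

def _save(path, obj):
    with gzip.open(path, "wb") as f:
        pkl.dump(obj, f)

def _load(path):
    with gzip.open(path, "rb") as f:
        return pkl.load(f)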
Code Example #9
def getFMFTRL():
    #os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')
    glove_file = '../feat/glove.6B.50d.txt'
    threads = 4
    save_dir = '../feat'
       
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]
    
    '''
    ix = (merge['brand_name']==merge['brand_name']) & \
            (~merge['brand_name'].str.lower().fillna('ZZZZZZ').isin(merge['name'].str.lower()))
    merge['name'][ix] = merge['brand_name'][ix] + ' ' +merge['name'][ix]
    '''
    
    
    #EXTRACT DEVELOPMENT TEST
    trnidx, validx = train_test_split(range(train.shape[0]), random_state=233, train_size=0.90)
    
    del train
    del test
    gc.collect()
    
    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    #merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))
    
    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))
    
    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))
    
    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() - start_time))
    
    '''
    Encode Original Strings
    '''
    '''
    for col in ['item_description', 'name']:    
        wb = CountVectorizer()
        if 'X_orig' not in locals():
            X_orig = wb.fit_transform(merge[col])
        else:
            X_orig = hstack((X_orig, wb.fit_transform(merge[col])))
        print ('Shape of original hash', X_orig.shape)
    X_orig = X_orig.tocsr()
    
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 3, 0, 1), dtype=bool)]
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 100, 1, 0), dtype=bool)]    
    print ('Shape of original hash', X_orig.shape)
    X_orig = X_orig.tocoo()
    '''
    
    '''
    Stemmer
    '''
    
    # https://github.com/skbly7/usefulness/blob/ed11cd55080d553cf62873999a5e00b154057fbc/textpreprocess.py
    from nltk.tokenize import WordPunctTokenizer    # This is better for sentences containing unicode, like: u"N\u00faria Espert"
    word_tokenize = WordPunctTokenizer().tokenize
    import Stemmer
    import string
    ps = Stemmer.Stemmer("english")
    _wsre = re.compile(r"\s+")
    _alphanumre = re.compile(r"[\w\-\' ]", re.UNICODE)
    def _removestopwords(txtwords):
        global stoplist
    #    stoplist = stopwords.words("english")
        if stoplist is None:
            stoplist = frozenset([l.strip() for l in open(STOPFILE).readlines()])
        return [[w for w in t if w not in stoplist] for t in txtwords]
    
    def _stem(txtwords):
        return [stemmer.stemWords(t) for t in txtwords]
    
    def _removenonalphanumericchars(txtwords):
        return [["".join([c for c in w if _alphanumre.search(c) is not None]) for w in t] for t in txtwords]
    
    
    def _stripallwhitespace(txts):
        return [_wsre.sub("", txt) for txt in txts]
    stemmer = Stemmer.Stemmer("english")

    def textpreprocess(txt, 
                       sentencetokenize=False, 
                       replacehyphenbyspace=True, 
                       wordtokenize=False,
                       lowercase=True,
                       stem=True, 
                       removenonalphanumericchars=True, 
                       stripallwhitespace=True):
        """
        Note: For html2text, one could also use NCleaner (common.html2text.batch_nclean)
        Note: One could improve the sentence tokenization, by using the
        original HTML formatting in the tokenization.
        Note: We use the Porter stemmer. (Optimization: Shouldn't rebuild
        the PorterStemmer object each time this function is called.)
        """
    
        if sentencetokenize:
            txts = nltk.sent_tokenize(txt)  # sentence tokenization (nltk import not shown in this excerpt)
            #txts = tokenizer.tokenize(txt.split())
        else:
            txts = txt.split()
        txt = None
        
        if replacehyphenbyspace:
            txts = [t.replace("-", " ") for t in txts]
    
        if wordtokenize:
            txtwords = [word_tokenize(t) for t in txts]
        else:
            txtwords = [t.split() for t in txts]
        txts = None
    
        if lowercase:
            txtwords = [[w.lower() for w in t] for t in txtwords]
    
        if stem:
            txtwords = _stem(txtwords)
    
        # TODO: Maybe remove Unicode accents? http://stackoverflow.com/questions/517923/what-is-the-best-way-to-remove-accents-in-a-python-unicode-string
    
        if removenonalphanumericchars:
            txtwords = _removenonalphanumericchars(txtwords)
    
        txtwords = [[w for w in t if w != ""] for t in txtwords]
    
        txts = [" ".join(words) for words in txtwords]
    
        if stripallwhitespace:
            for _ in range(2):
                txts = _stripallwhitespace(txts)

        return " ".join(txts)

    print('[{}] Start stemming'.format(time.time() - start_time))
    merge['stem_name'] =  [textpreprocess(s) for s in merge["name"].values]
    print('[{}] Stemming completed'.format(time.time() - start_time))
    
    '''
    Crossed columns
    '''
    # my understanding on how to replicate what layers.crossed_column does. One
    # can read here: https://www.tensorflow.org/tutorials/linear.
    def cross_columns(x_cols):
        """simple helper to build the crossed columns in a pandas dataframe
        """
        crossed_columns = dict()
        colnames = ['_'.join(x_c) for x_c in x_cols]
        for cname, x_c in zip(colnames, x_cols):
            crossed_columns[cname] = x_c
        return crossed_columns
    
    merge['item_condition_id_str'] = merge['item_condition_id'].astype(str)
    merge['shipping_str'] = merge['shipping'].astype(str)
    x_cols = (
              ['brand_name',  'item_condition_id_str'],
              ['brand_name',  'subcat_1'],
              ['brand_name',  'subcat_2'],
              ['brand_name',  'general_cat'],
              #['brand_name',  'subcat_1',  'item_condition_id_str'],
              #['brand_name',  'subcat_2',  'item_condition_id_str'],
              #['brand_name',  'general_cat',  'item_condition_id_str'],
              ['brand_name',  'shipping_str'],
              ['shipping_str',  'item_condition_id_str'],
              ['shipping_str',  'subcat_2'],
              ['item_condition_id_str',  'subcat_2']          
              )
    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(
        merge.select_dtypes(include=['object']).columns)
    
    D = 2**30
    for k, v in crossed_columns_d.items():
        print ('Crossed column ', k)
        outls_ = []
        indicator = 0 
        for col in v:
            outls_.append((np.array(merge[col].apply(hash)))%D + indicator)
            indicator += 10**6
        merge[k] = sum(outls_).tolist()
    
    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    lb = LabelBinarizer(sparse_output=True)
    x_col = lb.fit_transform(merge[cross_nm[0]])
    for i in range(1, len(cross_nm)):
        x_col = hstack((x_col, lb.fit_transform(merge[cross_nm[i]])))
    del(lb)
    
    
    '''
    Hash name
    '''
    
    
    wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0],
                                                                  "hash_size": 2 ** 29, "norm": None, "tf": 'binary',
                                                                  "idf": None,
                                                                  }), procs=8)
    wb.dictionary_freeze= True
    X_name = wb.fit_transform(merge['name'])
    del(wb)
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))    
    
    '''
    Hash category
    '''
    
    wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0],
                                                                  "hash_size": 2 ** 20, "norm": None, "tf": 'binary',
                                                                  "idf": None,
                                                                  }), procs=8)
    wb.dictionary_freeze= True
    cat = merge["category_name"].str.replace('/', ' ')
    X_cat = wb.fit_transform(cat)
    del(wb)
    X_cat = X_cat[:, np.array(np.clip(X_cat.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `category` completed.'.format(time.time() - start_time))
    
    '''
    Count category
    '''
    
    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))
    
    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0],
                                                                  "hash_size": 2 ** 28, "norm": "l2", "tf": 1.0,
                                                                  "idf": None})
                             , procs=8)
    wb.dictionary_freeze= True
    X_description = wb.fit_transform(merge['item_description'])
    del(wb)
    X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() - start_time))
    
    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() - start_time))
    
    X_dummies = csr_matrix(pd.get_dummies(merge[['item_condition_id', 'shipping']],
                                          sparse=True).values)
    
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.format(time.time() - start_time))
    # Note: X_stem_name (a vectorization of merge['stem_name'] built above) is never
    # constructed in this excerpt; it has to be created before the hstack below.
    print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape, X_stem_name.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name, X_cat,
                           x_col, X_stem_name)).tocsr()
    
    
    print('[{}] Create sparse merge completed'.format(time.time() - start_time))
    
    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)
    
    gc.collect()
    if develop:
        #train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233)
        train_X, valid_X, train_y, valid_y = X[trnidx], X[validx], y.values[trnidx], y.values[validx]
        
    model = FM_FTRL(alpha=0.005, beta=0.005, L1=0.00001, L2=0.1, D=sparse_merge.shape[1], alpha_fm=0.005, L2_fm=0.0, init_fm=0.01,
                    D_fm=200, e_noise=0.0001, iters=1, inv_link="identity", threads=threads) #iters=15
    
    baseline = 1.
    for i in range(15):
        model.fit(train_X , train_y , verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline:
            baseline = score_
        else:
            break
        
    
    print('[{}] Train ridge v2 completed'.format(time.time() - start_time))
    if develop:
        predsfm = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(predsfm)))
        # 0.44532 
        # Full data 0.424681
    
    
    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
    
    return merge, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
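
# rmsle() is called above but not defined in the excerpt. The standard definition
# it presumably follows (an assumption):
import numpy as np

def rmsle(y_true, y_pred):
    # Root mean squared logarithmic error, computed on the back-transformed prices.
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))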
Code Example #10
    test_ids = kaggle_test_df['id']
    test_size = kaggle_test_df.shape[0]

    if os.path.isfile(df_path):
        print('Preprocessed file found! Loading preprocessed Dataset')
        df_full = pd.read_csv(df_path)
    else:
        print('No preprocessed file found, start preprocessing')
        df_full = preprocessing(df, kaggle_test_df)

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {"hash_ngrams": 2,
                                                  "hash_ngrams_weights": [0.5, -1.0],
                                                  "hash_size": 2 ** 23,
                                                  "norm": 'l2',
                                                  "tf": 'log',
                                                  "idf": 10.0}),
                             procs=8)

    wb.dictionary_freeze = True

    X_title = wb.transform(df_full['stemmed_title'])
    # X_title = X_title[:, np.array(np.clip(X_title.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print("Xtitle shape", X_title.shape)

    X_text = wb.transform(df_full['stemmed_text'])
    # X_text = X_text[:, np.array(np.clip(X_text.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print("X_text shape", X_text.shape)
Code Example #11
File: WBFmFtrl22.py Project: alanshu2018/Blazer
class WBFmFtrlModel(object):
    wb = wordbatch.WordBatch(None,
                             extractor=(WordHash, {
                                 "ngram_range": (1, 1),
                                 "analyzer": "word",
                                 "lowercase": False,
                                 "n_features": D,
                                 "norm": None,
                                 "binary": True
                             }),
                             minibatch_size=batchsize // 80,
                             procs=8,
                             freeze=True,
                             timeout=1800,
                             verbose=0)

    #clf = FM_FTRL(alpha=0.05, beta=0.1, L1=0.0, L2=0.0, D=D, alpha_fm=0.02, L2_fm=0.0, init_fm=0.01, weight_fm=1.0,
    #          D_fm=8, e_noise=0.0, iters=3, inv_link="sigmoid", e_clip=1.0, threads=4, use_avx=1, verbose=0)

    def __init__(self, train_file, test_file):
        self.train_file = train_file
        self.test_file = test_file
        self.clf = None

    def create_clf(self):
        if self.clf is not None:
            del (self.clf)
            gc.collect()
        self.clf = FM_FTRL(alpha=0.05,
                           beta=0.1,
                           L1=0.0,
                           L2=0.0,
                           D=D,
                           alpha_fm=0.02,
                           L2_fm=0.0,
                           init_fm=0.01,
                           weight_fm=1.0,
                           D_fm=16,
                           e_noise=0.0,
                           iters=5,
                           inv_link="sigmoid",
                           e_clip=1.0,
                           threads=4,
                           use_avx=1,
                           verbose=0)

    def predict(self, predict_file):
        p = None
        test_preds = []
        click_ids = []
        X = None
        for df_c in pd.read_csv(predict_file,
                                engine='c',
                                chunksize=batchsize,
                                sep=",",
                                usecols=predictors + ["click_id", "weight"]):
            str_array = df2csr(df_c[predictors].values)
            labels = df_c["click_id"].values
            weights = df_c["weight"].values
            click_ids += df_c['click_id'].tolist()
            del (df_c)
            if p != None:
                test_preds += list(p.join())
                if X is not None:
                    del (X)
                    gc.collect()
            X = self.wb.transform(str_array)
            del (str_array)
            p = ThreadWithReturnValue(target=predict_batch, args=(self.clf, X))
            p.start()

        if p != None: test_preds += list(p.join())
        if X is not None:
            del (X)
            gc.collect()
        return click_ids, test_preds

    def read_data_file(self, train_file, skip_rows, nrows):
        if skip_rows > 0:
            skip_rows = range(1, skip_rows)
        else:
            skip_rows = None
        df_c = pd.read_csv(train_file,
                           skiprows=skip_rows,
                           nrows=nrows,
                           engine="c",
                           dtype=dtypes,
                           usecols=predictors + ["weight", "click_id"])
        str_array = df2csr((df_c[predictors].values))
        X = self.wb.transform(str_array)
        labels = df_c["click_id"].values
        weights = df_c["weight"].values
        del (str_array)
        del (df_c)
        gc.collect()
        return X, labels, weights

    def predict_data(self, X, labels, weights):
        return predict_batch(self.clf, X)

    def train_all(self):
        p = None
        X = None
        rcount = 0
        if True:
            start_time = time.time()

            self.create_clf()

            print("Train using file:{}".format(self.train_file))
            print("Pretrain the model")
            start = 24903889
            start_loops = int(start / batchsize)
            pos = 0
            for i in range(start_loops + 1):
                if p != None:
                    p.join()
                    if X is not None:
                        del (X)
                        X = None
                        del (labels)
                        del (weights)
                        gc.collect()
                nrows = batchsize
                if pos + batchsize > start:
                    nrows = start - pos + 1

                if nrows <= 1:
                    break

                print("Pretrain: pos={}, nrows={}".format(pos, nrows))
                if pos <= 0:
                    X, labels, weights = self.read_data_file(
                        self.train_file, 0, nrows)
                    pos += nrows
                else:
                    skip = pos - batchsize
                    X, labels, weights = self.read_data_file(
                        self.train_file, skip, nrows)
                    pos += nrows
                p = threading.Thread(target=fit_batch,
                                     args=(self.clf, X, labels, weights))
                p.start()

            rcount += start
            print("Training", rcount, time.time() - start_time)
            # First train
            tv = [batchsize, batchsize * 2, batchsize * 3, batchsize * 4]
            for idx, pos in enumerate(tv):
                skip = start + pos - batchsize
                if p != None:
                    p.join()
                    if X is not None:
                        del (X)
                        X = None
                        del (labels)
                        del (weights)
                        gc.collect()
                X, labels, weights = self.read_data_file(
                    self.train_file, skip, batchsize)
                rcount += batchsize
                if idx >= 1:
                    if p != None: p.join()
                    p = threading.Thread(target=evaluate_batch,
                                         args=(self.clf, X, labels, rcount))
                    p.start()
                if p != None: p.join()
                print("Training", rcount, time.time() - start_time)
                p = threading.Thread(target=fit_batch,
                                     args=(self.clf, X, labels, weights))
                p.start()

            if p != None: p.join()
            if X is not None:
                del (X)
                X = None
                del (labels)
                del (weights)
                gc.collect()

    def train_cv(self):
        p = None
        X = None
        rcount = 0
        if True:
            start_time = time.time()

            train_valids = [
                [batchsize, batchsize * 2, batchsize * 3, batchsize * 4],
                [batchsize * 2, batchsize * 3, batchsize * 4, batchsize],
                [batchsize, batchsize * 3, batchsize * 4, batchsize * 2],
                [batchsize, batchsize * 2, batchsize * 4, batchsize * 3],
            ]

            all_cv_preds = np.zeros(shape=(4 * batchsize, ), dtype=np.float16)
            for tv in train_valids:
                print("Train_CV: tv={}".format(tv))
                self.create_clf()

                print("Train using file:{}".format(self.train_file))
                print("Pretrain the model")
                start = 24903889
                start_loops = int(start / batchsize)
                pos = 0
                for i in range(start_loops + 1):
                    if p != None:
                        p.join()
                        if X is not None:
                            del (X)
                            X = None
                            del (labels)
                            del (weights)
                            gc.collect()
                    nrows = batchsize
                    if pos + batchsize > start:
                        nrows = start - pos + 1

                    if nrows <= 1:
                        break

                    print("Pretrain: pos={}, nrows={}".format(pos, nrows))
                    if pos <= 0:
                        X, labels, weights = self.read_data_file(
                            self.train_file, 0, nrows)
                        pos += nrows
                    else:
                        skip = pos - batchsize
                        X, labels, weights = self.read_data_file(
                            self.train_file, skip, nrows)
                        pos += nrows
                    p = threading.Thread(target=fit_batch,
                                         args=(self.clf, X, labels, weights))
                    p.start()

                rcount += start
                print("Training", rcount, time.time() - start_time)
                # First train
                for idx, pos in enumerate(tv[:3]):
                    skip = start + pos - batchsize
                    if p != None:
                        p.join()
                        if X is not None:
                            del (X)
                            X = None
                            del (labels)
                            del (weights)
                            gc.collect()
                    X, labels, weights = self.read_data_file(
                        self.train_file, skip, batchsize)
                    rcount += batchsize
                    if idx % 2 == 1:
                        if p != None: p.join()
                        p = threading.Thread(target=evaluate_batch,
                                             args=(self.clf, X, labels,
                                                   rcount))
                        p.start()
                    if p != None: p.join()
                    print("Training", rcount, time.time() - start_time)
                    p = threading.Thread(target=fit_batch,
                                         args=(self.clf, X, labels, weights))
                    p.start()

                if p != None: p.join()
                if X is not None:
                    del (X)
                    X = None
                    del (labels)
                    del (weights)
                    gc.collect()

                print("Predict for the validation data")
                pos = tv[3]
                skip = start + pos - batchsize
                X, labels, weights = self.read_data_file(
                    self.train_file, skip, batchsize)
                pred = predict_batch(self.clf, X)
                all_cv_preds[pos - batchsize:pos] = np.reshape(
                    pred, (batchsize, ))
                if X is not None:
                    del (X)
                    X = None
                    del (labels)
                    del (weights)
                    gc.collect()

            # Save cv result data
            fname = "%s/cv_pred_%s_%s.csv" % (config.OUTPUT_DIR, "fmftrl", Ver)
            print("Save cv predictions:{}".format(fname))
            df = pd.DataFrame({"predicted": all_cv_preds})
            df.to_csv(fname, index=False, columns=["predicted"])
Code Example #12
class WBFmFtrlModel(object):
    wb = wordbatch.WordBatch(None, extractor=(WordHash, {"ngram_range": (1, 1), "analyzer": "word",
                                                          "lowercase": False, "n_features": D,
                                                          "norm": None, "binary": True}),
                             minibatch_size=batchsize // 80, procs=8, freeze=True, timeout=1800, verbose=0)
    #clf = FM_FTRL(alpha=0.05, beta=0.1, L1=0.0, L2=0.0, D=D, alpha_fm=0.02, L2_fm=0.0, init_fm=0.01, weight_fm=1.0,
    #          D_fm=8, e_noise=0.0, iters=3, inv_link="sigmoid", e_clip=1.0, threads=4, use_avx=1, verbose=0)

    def __init__(self,pretrain_files,train_file, test_file):
        self.pretrain_files = pretrain_files
        self.train_file = train_file
        self.test_file = test_file
        self.clf = None
        self.pretrain_model_fn = "wb_fmftrl_v26_pretrain.model"

    def create_clf(self):
        if self.clf is not None:
            del(self.clf)
            gc.collect()
        self.clf = FM_FTRL(alpha=0.05, beta=0.1, L1=0.0, L2=0.0, D=D, alpha_fm=0.02, L2_fm=0.0, init_fm=0.01, weight_fm=1.0,
                      D_fm=16, e_noise=0.0, iters=5, inv_link="sigmoid", e_clip=1.0, threads=4, use_avx=1, verbose=0)

    def get_data(self, loader, fold= -1, chunk_size=10000000, file_size=40000000):
        if fold > 0:
            size_per_fold = int(file_size/fold)
        else:
            size_per_fold = chunk_size

        for (idx, df) in loader.get_chunk_data():
            data = df[predictors].values
            labels = df['click_id'].values
            weights = df['weight'].values
            if fold == -1:
                fold_num = -1
            else:
                fold_num = int(idx / size_per_fold)
            del(df)
            gc.collect()

            str_array = df2csr(data)
            X = self.wb.transform(str_array)
            del(str_array)
            del(data)
            gc.collect()
            yield (idx, fold_num, X, labels, weights)

    def do_thread_execute(self,target,clf, X, labels=None, weights=None,do_free=True):
        #str_array = df2csr(data)
        #gc.collect()
        #X = self.wb.transform(str_array)
        if labels is not None:
            args = (clf, X, labels, weights)
        else:
            args = (clf, X)
        p = ThreadWithReturnValue(target=target,args =args)
        p.start()
        ret = p.join()
        if do_free:
            del(X)
            if labels is not None:
                del(labels)
            if weights is not None:
                del(weights)
        gc.collect()

        return ret

    def predict(self,predict_file):
        test_preds = []
        click_ids = []
        test_loader = DataPiper(predict_file,logger)
        for (idx, fold_num, X, labels, weights) in self.get_data(test_loader):
            click_ids+= labels.tolist()
            test_preds += list(self.do_thread_execute(predict_batch,self.clf,X))

        return click_ids, test_preds

    def predict_data(self, X, labels, weights):
        return predict_batch(self.clf, X)

    def pretrain(self):
        p = None
        X = None
        rcount = 0

        start_time = time.time()

        self.create_clf()

        if not os.path.exists(self.pretrain_model_fn):
            print("Pretrain the model")
            for pretrain_file in self.pretrain_files:
                print("Pretrain using file:{}".format(pretrain_file))
                loader = DataPiper(pretrain_file,logger)
                for (idx, fold_num, X, labels, weights) in self.get_data(loader):
                    self.do_thread_execute(fit_batch,self.clf,X,labels,weights)

            with open(self.pretrain_model_fn,"wb") as f:
                params = self.clf.__getstate__() #self.create_clf()
                pkl.dump(params,f)
            #self.clf.pickle_model(self.pretrain_model_fn)
        else:
            with open(self.pretrain_model_fn,"rb") as f:
                params = pkl.load(f)
                self.clf.__setstate__(params)
            #self.clf.unpickle_model(self.pretrain_model_fn)

    def train_all(self):
        p = None
        X = None
        rcount = 0

        start_time = time.time()

        self.create_clf()

        print("Pretrain the model")
        self.pretrain()
        """
        for pretrain_file in self.pretrain_files:
            print("Pretrain using file:{}".format(pretrain_file))
            loader = DataPiper(pretrain_file,logger)
            for (idx, fold_num, X, labels, weights) in self.get_data(loader):
                self.do_thread_execute(fit_batch,self.clf,X,labels,weights)
        """

        print("Train with file={}".format(self.train_file))
        rcount = 0
        loader = DataPiper(self.train_file,logger)
        loops = 0
        for (idx, fold_num, X, labels, weights) in self.get_data(loader):
            if loops % 2 == 0:
                self.do_thread_execute(evaluate_batch,self.clf,X,labels,weights, do_free=False)
            loops += 1
            rcount += len(labels)

            print("Training", rcount, time.time() - start_time)
            self.do_thread_execute(fit_batch,self.clf,X,labels,weights)

    def train_cv(self):
        start_time = time.time()

        nfold = 4
        train_preds = []
        auc_cv = [0.0 for _ in range(nfold)]
        for fold in range(nfold):
            self.create_clf()
            print("Pretrain models")
            self.pretrain()
            """
            for pretrain_file in self.pretrain_files:
                print("Pretrain using file:{}".format(pretrain_file))
                loader = DataPiper(pretrain_file,logger)
                for (idx, fold_num, X, labels, weights) in self.get_data(loader):
                    self.do_thread_execute(fit_batch,self.clf,X,labels,weights)
            """
            print("Train with file={}".format(self.train_file))
            file_size = 40000000
            all_cv_preds = np.zeros(shape=(file_size,),dtype=np.float32)
            loader = DataPiper(self.train_file,logger)
            valid_datas = []
            loops = 0
            rcount = 0
            for (idx, fold_num, X, labels, weights) in self.get_data(loader,fold=nfold,file_size=file_size):
                print("fold_num={},fold={},nfold={}".format(fold_num,fold,nfold))
                if fold_num == fold:
                    valid_datas.append((idx,fold_num,X,labels,weights))
                    print("Add valid_datas:len={}".format(len(valid_datas)))
                    continue

                loops += 1
                rcount += len(labels)
                if loops % 2 == 0:
                    self.do_thread_execute(evaluate_batch,self.clf,X,labels,weights,do_free=False)

                print("Training", rcount, time.time() - start_time)
                self.do_thread_execute(fit_batch,self.clf,X,labels,weights)

            print("Predict for the validation data")
            print("Valid_datas:len={}".format(len(valid_datas)))
            valid_start_idx = valid_datas[0][0]
            valid_labels = []
            valid_weights = []
            valid_ds = []
            for d in valid_datas:
                valid_labels.append(d[3])
                valid_weights.append(d[4])
                valid_ds.append(d[2])
                #print("Valid_ds:d.len={},valid_ds.len={}".format(len(d[2]),len(valid_ds)))
            num = len(valid_labels)
            if num > 1:
                valid_weights = np.concatenate(valid_weights,axis=0)
                valid_labels = np.concatenate(valid_labels, axis=0)
                from scipy.sparse import vstack
                #valid_ds = np.concatenate(valid_ds,axis=0)
                # The chunks are row blocks with the same feature width, so stack them vertically.
                valid_ds = vstack(valid_ds)
            else:
                valid_labels = valid_labels[0]
                valid_weights = valid_weights[0]
                valid_ds = valid_ds[0]
            y_pred = self.do_thread_execute(predict_batch,self.clf,valid_ds)
            num = len(valid_labels)
            y_pred = np.reshape(y_pred,(num,))
            print("y_pred.shape={}".format(y_pred.shape))
            print("valid_labels.shape={}".format(valid_labels.shape))
            valid_labels = np.reshape(valid_labels,(num,))
            train_preds.append((valid_start_idx,num,y_pred))
            auc_cv[fold] = dist_utils._auc(valid_labels, y_pred)
            logger.info("      {:>3}    {:>8}    {} x {}".format(
                fold+1, np.round(auc_cv[fold],6), valid_ds.shape[0], valid_ds.shape[1]))

            #clean up
            del(valid_datas)
            del(valid_ds)
            del(valid_labels)
            del(valid_weights)
            gc.collect()

        # Save cv result data
        fname = "%s/cv_pred_%s_%s.csv"%(config.OUTPUT_DIR, "fmftrl",Ver)
        print("Save cv predictions:{}".format(fname))
        df = pd.DataFrame({"predicted": all_cv_preds})
        df.to_csv(fname, index=False, columns=["predicted"])
Code Example #13
File: 2501_gru_wb.py Project: satadru5/mercari
def getFMFTRL():
    #os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')

    glove_file = '../feat/glove.6B.50d.txt'
    threads = 8
    save_dir = '../feat'

    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]

    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]
    ix = (merge['brand_name'] == merge['brand_name']) & (
        ~merge['brand_name'].str.lower().fillna('ZZZZZZ').isin(
            merge['name'].str.lower()))
    merge['name'][ix] = merge['brand_name'][ix] + ' ' + merge['name'][ix]

    #EXTRACT DEVELOPMENT TEST
    trnidx, validx = train_test_split(range(train.shape[0]),
                                      random_state=233,
                                      train_size=0.90)

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    #merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() -
                                                      start_time))
    ''' 
    Regex characteristics - carat, gb/tb, cpu
    '''
    def count_rgx(regexls, idx_, filter_=None):
        colvals = merge['name'][idx_] + ' ' + merge['item_description'][idx_]
        vals = pd.Series(np.zeros(len(colvals)))
        for rgx_ in regexls:
            valsls = colvals.str.findall(rgx_, re.IGNORECASE)
            vals[vals == 0] += pd.Series([
                int(v[0]) if len(set(v)) == 1 else 0 for v in valsls
            ])[vals == 0]
        if filter_:
            vals[~vals.isin(filter_)] = 0.
        return vals

    def count_rgx_name(regexls, idx_, filter_=None):
        colvals = merge['name'][idx_]
        vals = pd.Series(np.zeros(len(colvals)))
        for rgx_ in regexls:
            valsls = colvals.str.findall(rgx_, re.IGNORECASE)
            vals[vals == 0] += pd.Series(
                [int(v[0]) if len(v) != 0 else 0 for v in valsls])[vals == 0]
        if filter_:
            vals[~vals.isin(filter_)] = 0.
        return vals

    # gold
    measures = np.zeros((merge.shape[0], 4))
    ix_chk = ((merge.name.str.contains('gold', case=False)) | \
                (merge.item_description.str.contains('gold', case=False))) & \
                (merge['subcat_1'] == 'Jewelry')
    rgxls = [
        r"(\d+)k ", r"(\d+)kt ", r"(\d+)k.", r"(\d+)kt.", r"(\d+)k,",
        r"(\d+)kt,", r"(\d+) k ", r"(\d+) kt", r"(\d+) k.", r"(\d+) kt.",
        r"(\d+) k,", r"(\d+) kt,"
    ]
    measures[ix_chk,
             0] = count_rgx(rgxls,
                            ix_chk,
                            filter_=[10, 12, 14, 16, 18, 20, 21, 22, 23, 24])

    # phone memory
    ix_chk = (merge['subcat_2'] == 'Cell Phones & Smartphones')
    rgxls = [
        r"(\d+)gb ", r"(\d+) gb", r"(\d+)gb.", r"(\d+) gb.", r"(\d+)gb,",
        r"(\d+) gb,"
    ]
    measures[ix_chk, 1] = count_rgx(rgxls, ix_chk)

    # console memory
    ix_chk = (merge['subcat_2'] == 'Consoles')
    rgxls = [
        r"(\d+)gb ", r"(\d+) gb", r"(\d+)gb.", r"(\d+) gb.", r"(\d+)gb,",
        r"(\d+) gb,"
    ]
    measures[ix_chk, 2] = count_rgx(rgxls, ix_chk)

    # computer memory
    ix_chk = (merge['category_name'] == 'Electronics/Computers & Tablets/Laptops & Netbooks') | \
        (merge['category_name'] == 'Electronics/Computers & Tablets/Desktops & All-In-Ones')
    rgxls = [
        r"(\d+)gb ", r"(\d+) gb", r"(\d+)gb.", r"(\d+) gb.", r"(\d+)gb,",
        r"(\d+) gb,"
    ]
    measures[ix_chk, 3] = count_rgx(rgxls, ix_chk)

    # cpu

    # oz

    # diamond
    #r"(\d+) karat ", r"(\d+) carat "
    '''
    Crossed columns
    '''

    # my understanding on how to replicate what layers.crossed_column does. One
    # can read here: https://www.tensorflow.org/tutorials/linear.
    def cross_columns(x_cols):
        """simple helper to build the crossed columns in a pandas dataframe
        """
        crossed_columns = dict()
        colnames = ['_'.join(x_c) for x_c in x_cols]
        for cname, x_c in zip(colnames, x_cols):
            crossed_columns[cname] = x_c
        return crossed_columns

    merge['item_condition_id_str'] = merge['item_condition_id'].astype(str)
    merge['shipping_str'] = merge['shipping'].astype(str)
    x_cols = (
        ['brand_name', 'item_condition_id_str'],
        ['brand_name', 'subcat_1'],
        ['brand_name', 'subcat_2'],
        ['brand_name', 'general_cat'],
        #['brand_name',  'subcat_1',  'item_condition_id_str'],
        #['brand_name',  'subcat_2',  'item_condition_id_str'],
        #['brand_name',  'general_cat',  'item_condition_id_str'],
        ['brand_name', 'shipping_str'],
        ['shipping_str', 'item_condition_id_str'],
        ['shipping_str', 'subcat_2'],
        ['item_condition_id_str', 'subcat_2'])
    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(merge.select_dtypes(include=['object']).columns)

    D = 2**30
    for k, v in crossed_columns_d.items():
        print('Crossed column ', k)
        outls_ = []
        indicator = 0
        for col in v:
            outls_.append((np.array(merge[col].apply(hash))) % D + indicator)
            indicator += 10**6
        merge[k] = sum(outls_).tolist()
    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    lb = LabelBinarizer(sparse_output=True)
    x_col = lb.fit_transform(merge[cross_nm[0]])
    for i in range(1, len(cross_nm)):
        x_col = hstack((x_col, lb.fit_transform(merge[cross_nm[i]])))
    del (lb)
    '''
    Encode Original Strings
    '''
    '''
    for col in ['item_description', 'name']:    
        lb = LabelBinarizer(sparse_output=True)
        if 'X_orig' not in locals():
            X_orig = lb.fit_transform(merge[col].apply(hash))
        else:
            X_orig = hstack((X_orig, lb.fit_transform(merge[col].apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['item_description']+merge['name']).apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['brand_name']+merge['name']).apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['subcat_2']+merge['name']).apply(hash))))
    X_orig = hstack((X_orig, lb.fit_transform((merge['brand_name']+merge['name']+merge['item_description']).apply(hash))))
    X_orig = X_orig.tocsr()
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 2, 0, 1), dtype=bool)]
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 5000, 1, 0), dtype=bool)]    
    print ('Shape of original hash', X_orig.shape)
    X_orig = X_orig.tocoo()
    '''
    gc.collect()
    cpuStats()
    '''
    Hash name
    '''

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del (wb)
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
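    # The boolean mask above keeps only hashed n-gram columns that occur in at
    # least two rows (getnnz(axis=0) is the per-column document frequency),
    # which is equivalent to X_name.getnnz(axis=0) > 1.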
    '''
    Hash category
    '''

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**20,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    cat = merge["category_name"].str.replace('/', ' ')
    X_cat = wb.fit_transform(cat)
    del (wb)
    X_cat = X_cat[:,
                  np.array(np.clip(X_cat.getnnz(axis=0) -
                                   1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `category` completed.'.format(time.time() -
                                                        start_time))
    '''
    Count category
    '''

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del (wb)
    X_description = X_description[:,
                                  np.array(np.clip(
                                      X_description.getnnz(axis=0) - 1, 0, 1),
                                           dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    X_memory = lb.fit_transform(merge['measure_memory'])
    # keep measure columns with document frequency below 10**6
    mask = np.array(X_memory.getnnz(axis=0) < 10**6, dtype=bool)
    X_memory = X_memory[:, mask]
    X_gold = lb.fit_transform(merge['measure_gold'])
    mask = np.array(X_gold.getnnz(axis=0) < 10**6, dtype=bool)
    X_gold = X_gold[:, mask]
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)

    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))
    '''
    print(X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape, X_orig.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name, X_cat,
                           x_col, X_orig)).tocsr()
    '''

    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape, X_memory.shape, X_gold.shape)
    sparse_merge = hstack(
        (X_dummies, X_description, X_brand, X_category1, X_category2,
         X_category3, X_name, X_cat, x_col, X_memory, X_gold)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    gc.collect()
    if develop:
        #train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233)
        train_X, valid_X, train_y, valid_y = X[trnidx], X[validx], y.values[
            trnidx], y.values[validx]

    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=1,
                    inv_link="identity",
                    threads=threads)  #iters=15

    baseline = 1.
    for i in range(15):
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline:
            baseline = score_
        else:
            break
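    # Hedged note: `rmsle` is defined earlier in the kernel; since the target is
    # log1p-transformed, a version consistent with its use here is plain RMSE on
    # the log scale, e.g.
    # def rmsle(y_true, y_pred):
    #     return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))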

    print('[{}] Train ridge v2 completed'.format(time.time() - start_time))
    if develop:
        predsfm = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y),
                                          np.expm1(predsfm)))
        # 0.44532
        # Full data 0.424681
        # 0.419741

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

    return merge, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
コード例 #14
0
text_seq_cols = ["name", "item_description"]

fill_missing(all_data, text_cols, num_cols, bin_cols)
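# Hedged sketch -- `fill_missing`, `text_cols`, `num_cols` and `bin_cols` come
# from earlier in this script and are not shown. A fill_missing consistent with
# the call above might simply be:
# def fill_missing(df, text_cols, num_cols, bin_cols):
#     for c in text_cols:
#         df[c] = df[c].fillna('missing')
#     for c in num_cols:
#         df[c] = df[c].fillna(0)
#     for c in bin_cols:
#         df[c] = df[c].fillna(0).astype(int)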

all_data["all_text"] = all_data["brand_name"].astype(str) + " " + all_data[
    "name"].astype(str) + " " + all_data['item_description']
all_data["name_brand"] = all_data["brand_name"].astype(
    str) + " " + all_data["name"].astype(str)

wb = wordbatch.WordBatch(
    normalize_text,
    extractor=(
        WordBag,
        {
            "hash_ngrams": 1,
            # "hash_ngrams_weights": [1.5, 1.0],
            "hash_size": 2**29,
            "norm": None,
            "tf": 'binary',
            "idf": None,
        }),
    procs=8)
wb.dictionary_freeze = True
X_all_text = wb.fit_transform(all_data['all_text'])
del (wb)
X_all_text = X_all_text[:,
                        np.array(np.clip(X_all_text.getnnz(axis=0) - 1, 0, 1),
                                 dtype=bool)]
print('[{}] Vectorize `all text` completed.'.format(time.time() - start_time))
print(X_all_text.shape)
コード例 #15
0
    dataset.loc[~dataset['subcat_1'].isin(pop_category2), 'subcat_1'] = 'missing'
    dataset.loc[~dataset['subcat_2'].isin(pop_category3), 'subcat_2'] = 'missing'
cutting(full_df)

stopwords = {x: 1 for x in stopwords.words('english')}
non_alphanums = re.compile(u'[^A-Za-z0-9]+')

def normalize_text(text):
    return u" ".join(
        [x for x in [y for y in non_alphanums.sub(' ', text).lower().strip().split(" ")] \
         if x not in stopwords])
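# Illustrative example of the normalizer above:
#   normalize_text("Brand-new GOLD 14k ring!!") -> "brand new gold 14k ring"
# (non-alphanumerics become spaces, text is lowercased, English stopwords
# are dropped).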
         

wb = wordbatch.WordBatch(normalize_text,
                         extractor=(WordBag, {"hash_ngrams": 3,
                                              "hash_ngrams_weights": [1.6, 0.8, 0.4],
                                              "hash_size": 2 ** 28, "norm": "l2",
                                              "tf": 'binary', "idf": None}),
                         procs=8)
wb.dictionary_freeze = True
X_name = wb.fit_transform(full_df['name'])
del(wb)
#X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
X_name = X_name[:, np.where(X_name.getnnz(axis=0) > 2)[0]]
#print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

wb = CountVectorizer()
X_category1 = wb.fit_transform(full_df['subcat_0'])
X_category2 = wb.fit_transform(full_df['subcat_1'])
X_category3 = wb.fit_transform(full_df['subcat_2'])
#print('[{}] Count vectorize `categories` completed.'.format(time.time() - start_time))
コード例 #16
0
import re


def normalize_text(text):
    return u" ".join(
        [x for x in [y for y in non_alphanums.sub(' ', text).lower().strip().split(" ")] \
         if len(x) > 1 and x not in stopwords])
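# Illustrative example: with the additional len(x) > 1 filter, single-character
# tokens are dropped as well, e.g.
#   normalize_text("iPhone 7 case w/ box") -> "iphone case box"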


merge = pd.read_pickle('merge.pkl')

wb = wordbatch.WordBatch(extractor=(WordBag, {
    "hash_ngrams": 2,
    "hash_ngrams_weights": [1.5, 1.0],
    "hash_size": 2**29,
    "norm": None,
    "tf": 'binary',
    "idf": None
}),
                         procs=0)

merge['name'] = merge['name'].map(lambda x: normalize_text(x))

wb.dictionary_freeze = True
X_name = wb.fit_transform(merge['name'])

del (wb)
X_name = X_name[:,
                np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
コード例 #17
0
ファイル: rscript436.py プロジェクト: darkblue-b/kaggleScape
def wordbatch_algo(test):
    import time

    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    # if 1 == 1:
    # train = pd.read_table('../input/mercari-price-suggestion-challenge/train.tsv', engine='c')
    # test = pd.read_table('../input/mercari-price-suggestion-challenge/test.tsv', engine='c')

    train = pd.read_table('../input/train.tsv', engine='c')
    # test = pd.read_table('../input/test.tsv', engine='c')

    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge: pd.DataFrame = pd.concat([train, dftt, test])
    # submission: pd.DataFrame = test[['test_id']]
    '''
    # Mean of each group # https://stackoverflow.com/questions/30244952/python-pandas-create-new-column-with-groupby-sum
    cat_mean = train['price'].groupby(train['category_name']).mean()
    cat_mean = pd.DataFrame({'category_name':cat_mean.index, 'cat_mean':cat_mean.values})
    merge = merge.merge(cat_mean, on=['category_name'], how='left')
    # print(merge.head())
    X_cat_mean = merge['cat_mean'].as_matrix().reshape(-1, 1)
    # X_cat_mean = normalize(np.nan_to_num(X_cat_mean).reshape(-1, 1), norm='max')  
    
    
    cond_mean = train['price'].groupby(train['item_condition_id']).mean()
    cond_mean = pd.DataFrame({'item_condition_id':cond_mean.index, 'cond_mean':cond_mean.values})
    merge = merge.merge(cond_mean, on=['item_condition_id'], how='left')
    X_cond_mean = merge['cond_mean'].as_matrix().reshape(-1, 1)
    

    brand_mean = train['price'].groupby(train['brand_name']).mean()
    brand_mean = pd.DataFrame({'brand_name':brand_mean.index, 'brand_mean':brand_mean.values})
    merge = merge.merge(brand_mean, on=['brand_name'], how='left')
    X_brand_mean = merge['brand_mean'].as_matrix().reshape(-1, 1)
    

    ship_mean = train['price'].groupby(train['shipping']).mean()
    ship_mean = pd.DataFrame({'shipping':ship_mean.index, 'ship_mean':ship_mean.values})
    merge = merge.merge(ship_mean, on=['shipping'], how='left')
    X_ship_mean = merge['ship_mean'].as_matrix().reshape(-1, 1)
    '''

    del train
    del test
    gc.collect()



    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))
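    # Hedged note: `split_cat` is defined earlier in the kernel; the version used
    # in most Mercari kernels splits "Men/Tops/T-shirts" into
    # ('Men', 'Tops', 'T-shirts') and falls back to "No Label" triples:
    # def split_cat(text):
    #     try:
    #         return text.split("/")
    #     except AttributeError:
    #         return ("No Label", "No Label", "No Label")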

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() -
                                                      start_time))

    # Add some new features:
    X_len_desc = merge['item_description'].apply(
        lambda x: len(x)).as_matrix().reshape(-1, 1)
    X_len_name = merge['name'].apply(lambda x: len(x)).as_matrix().reshape(
        -1, 1)

    # X_len_description = normalize(np.nan_to_num(X_len_description).reshape(-1, 1), norm='max')
    # X_len_name = normalize(np.nan_to_num(X_len_name).reshape(-1, 1), norm='max')

    print('[{}] Length `item_description` completed.'.format(time.time() -
                                                             start_time))
    print('[{}] Length `name` completed.'.format(time.time() - start_time))

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del (wb)
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del (wb)
    X_description = X_description[:,
                                  np.array(np.clip(
                                      X_description.getnnz(axis=0) - 1, 0, 1),
                                           dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))
    print(
        X_dummies.shape, X_description.shape, X_brand.shape, X_category1.shape,
        X_category2.shape, X_category3.shape, X_name.shape
    )  #, X_glove.shape, X_len_description.shape, X_len_name.shape, X_cat_mean.shape)
    # sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1, X_category2, X_category3, X_name)).tocsr()
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))
    del X_dummies, merge, X_description, lb, X_brand, X_category1, X_category2, X_category3, X_name
    gc.collect()

    #    pd.to_pickle((sparse_merge, y), "xy.pkl")
    # else:
    #    nrow_train, nrow_test= 1481661, 1482535
    #    sparse_merge, y = pd.read_pickle("xy.pkl")

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(
            X, y, test_size=TEST_SIZE, random_state=SPLIT_SEED)

    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=FM_iter,
                    inv_link="identity",
                    threads=4)
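    # Hedged reading of the FM_FTRL settings above: alpha/beta and L1/L2 are the
    # FTRL-proximal learning-rate and regularisation terms of the linear part,
    # D must equal the number of columns of the sparse design matrix, D_fm is
    # the dimensionality of the second-order factorization-machine embeddings
    # (with alpha_fm / L2_fm / init_fm controlling their updates), and
    # inv_link="identity" keeps this a plain regression on the log1p price.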

    model.fit(train_X, train_y)
    gc.collect()
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))
    gc.collect()
    print(predsFM)

    #model = FTRL(alpha=0.01, beta=0.1, L1=0.00001, L2=1.0, D=sparse_merge.shape[1], iters=50, inv_link="identity", threads=1)
    model = FTRL(alpha=0.01,
                 beta=0.1,
                 L1=0.00001,
                 L2=1.0,
                 D=sparse_merge.shape[1],
                 iters=FTRL_iter,
                 inv_link="identity",
                 threads=1)
    del X
    gc.collect()
    model.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsF = model.predict(X_test)
    print('[{}] Predict FTRL completed'.format(time.time() - start_time))
    print(predsF)

    del train_X, train_y
    del X_test

    return predsFM, predsF
コード例 #18
0
ファイル: 3101_gru_wb_tst.py プロジェクト: satadru5/mercari
def getFMFTRL(moddict):
    #os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')

    glove_file = '../feat/glove.6B.50d.txt'
    threads = 8
    save_dir = '../feat'

    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]

    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]
    #EXTRACT DEVELOPTMENT TEST
    trnidx, validx = train_test_split(range(train.shape[0]),
                                      random_state=233,
                                      train_size=0.90)

    del train
    del test
    gc.collect()
    cpuStats()

    merge = prepFMFeatures(merge)
    cpuStats()
    merge.head()
    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    moddict['cross_cols'] = {}
    for i in range(0, len(cross_nm)):
        moddict['cross_cols'][cross_nm[i]] = LabelBinarizer(sparse_output=True)
        moddict['cross_cols'][cross_nm[i]].fit(merge[cross_nm[i]])
        if i == 0:
            x_col = moddict['cross_cols'][cross_nm[i]].transform(
                merge[cross_nm[i]])
        else:
            x_col = hstack(
                (x_col, moddict['cross_cols'][cross_nm[i]].transform(
                    merge[cross_nm[i]])))
        del merge[cross_nm[i]]
    gc.collect()
    cpuStats()
    '''
    Hash name
    '''
    moddict['wb_name'] = wordbatch.WordBatch(normalize_text,
                                             extractor=(WordBag, {
                                                 "hash_ngrams": 2,
                                                 "hash_ngrams_weights": [1.5, 1.0],
                                                 "hash_size": 2**29,
                                                 "norm": None,
                                                 "tf": 'binary',
                                                 "idf": None,
                                                 'verbose': 1,
                                             }),
                                             procs=8)
    moddict['wb_name'].dictionary_freeze = True
    X_name = moddict['wb_name'].fit_transform(merge['name'])
    moddict['wb_name_mask'] = np.array(np.clip(
        X_name[:nrow_train].getnnz(axis=0) - 1, 0, 1),
                                       dtype=bool)
    X_name = X_name[:, moddict['wb_name_mask']]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
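    # The column mask is computed on the training rows only and stored in
    # moddict, so the same column selection can be re-applied later when the
    # stored (frozen) WordBatch object transforms new data.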
    '''
    Hash category #2
    '''
    moddict['wb_cat'] = wordbatch.WordBatch(normalize_text,
                                            extractor=(WordBag, {
                                                "hash_ngrams": 2,
                                                "hash_ngrams_weights": [1.0, 1.0],
                                                "hash_size": 2**20,
                                                "norm": None,
                                                "tf": 'binary',
                                                "idf": None,
                                            }),
                                            procs=4)
    moddict['wb_cat'].dictionary_freeze = True
    ### This must be the full dataset
    cats = merge["category_name"].str.replace('/', ' ').unique()
    moddict['wb_cat'].fit(cats)
    X_cat_tmp = moddict['wb_cat'].transform(cats)
    moddict['wb_cat_dict'] = dict([
        (c, X_cat_tmp.getrow(row))
        for (c, row) in zip(cats.tolist(), range(len(cats)))
    ])
    X_cat = vstack(([
        moddict['wb_cat_dict'][c]
        for c in merge["category_name"].str.replace('/', ' ')
    ]))
    moddict['wb_cat_mask'] = np.array(np.clip(
        X_cat[:nrow_train].getnnz(axis=0) - 1, 0, 1),
                                      dtype=bool)
    X_cat = X_cat[:, moddict['wb_cat_mask']]
    print('[{}] Vectorize `category` completed.'.format(time.time() -
                                                        start_time))
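    # The block above vectorizes each *unique* category string once, caches the
    # resulting rows in wb_cat_dict, and rebuilds the per-item matrix with
    # vstack, which is far cheaper than hashing the same category string once
    # per row.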
    '''
    Count category
    '''

    moddict['wb_cat_ctgc'] = CountVectorizer()
    moddict['wb_cat_ctgc'].fit(merge['general_cat'])
    X_category1 = moddict['wb_cat_ctgc'].transform(merge['general_cat'])
    moddict['wb_cat_ctsc1'] = CountVectorizer()
    moddict['wb_cat_ctsc1'].fit(merge['subcat_1'])
    X_category2 = moddict['wb_cat_ctsc1'].transform(merge['subcat_1'])
    moddict['wb_cat_ctsc2'] = CountVectorizer()
    moddict['wb_cat_ctsc2'].fit(merge['subcat_2'])
    X_category3 = moddict['wb_cat_ctsc2'].transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    moddict['wb_dscr'] = wordbatch.WordBatch(normalize_text,
                                             extractor=(WordBag, {
                                                 "hash_ngrams": 2,
                                                 "hash_ngrams_weights": [1.0, 0.6],
                                                 "hash_size": 2**28,
                                                 "norm": None,
                                                 "tf": 'binary',
                                                 "idf": None
                                             }),
                                             procs=8)
    moddict['wb_dscr'].dictionary_freeze = True
    X_description = moddict['wb_dscr'].fit_transform(merge['name'] + ' ' +
                                                     merge['item_description'])
    moddict['wb_dscr_mask'] = np.array(np.clip(
        X_description[:nrow_train].getnnz(axis=0) - 1, 0, 1),
                                       dtype=bool)
    X_description = X_description[:, moddict['wb_dscr_mask']]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    moddict['wb_brandname'] = LabelBinarizer(sparse_output=True)
    moddict['wb_brandname'].fit(merge['brand_name'][:nrow_train])
    X_brand = moddict['wb_brandname'].transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    moddict['wb_itemcond'] = LabelBinarizer(sparse_output=True)
    moddict['wb_itemcond'].fit(merge['item_condition_id'][:nrow_train])
    X_itemcond = moddict['wb_itemcond'].transform(merge['item_condition_id'])
    print('[{}] Label binarize `item_condition_id` completed.'.format(
        time.time() - start_time))

    moddict['wb_shipping'] = LabelBinarizer(sparse_output=True)
    moddict['wb_shipping'].fit(merge['shipping'][:nrow_train])
    X_shipping = moddict['wb_shipping'].transform(merge['shipping'])
    print('[{}] Label binarize `shipping` completed.'.format(time.time() -
                                                             start_time))

    print(
        X_itemcond.shape,
        X_shipping.shape,  #X_dummies.shape, 
        X_description.shape,
        X_brand.shape,
        X_category1.shape,
        X_category2.shape,
        X_category3.shape,
        X_name.shape,
        X_cat.shape,
        x_col.shape)
    sparse_merge = hstack((
        X_itemcond,
        X_shipping,  #X_dummies, 
        X_description,
        X_brand,
        X_category1,
        X_category2,
        X_category3,
        X_name,
        X_cat,
        x_col)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    moddict['all_mask'] = np.array(np.clip(
        sparse_merge.getnnz(axis=0) - 1, 0, 1),
                                   dtype=bool)
    sparse_merge = sparse_merge[:, moddict['all_mask']]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    gc.collect()
    if develop:
        #train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233)
        train_X, valid_X, train_y, valid_y = X[trnidx], X[validx], y.values[
            trnidx], y.values[validx]

    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=1,
                    inv_link="identity",
                    threads=threads)  #iters=15

    baseline = 1.
    for i in range(15):
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline - 0.0004:
            baseline = score_
        else:
            break
        # 0.41357

    moddict['FMmodel'] = model

    print('[{}] Train ridge v2 completed'.format(time.time() - start_time))
    if develop:
        predsfm = moddict['FMmodel'].predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y),
                                          np.expm1(predsfm)))
        # 0.44532
        # Full data 0.424681

    predsFM = moddict['FMmodel'].predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

    return merge, moddict, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
コード例 #19
0
ファイル: t1.py プロジェクト: alanshu2018/Blazer
#from sklearn.feature_extraction.text import HashingVectorizer
#from sklearn.linear_model import *
#vct= HashingVectorizer()
#clf= SGDRegressor()

import wordbatch
from wordbatch.models import FTRL
from wordbatch.extractors import WordBag
wb= wordbatch.WordBatch(extractor=(WordBag, {"hash_ngrams":2, "hash_ngrams_weights":[0.5, -1.0], "hash_size":2**23, "norm":'l2', "tf":'log', "idf":50.0}))
clf= FTRL(alpha=1.0, beta=1.0, L1=0.00001, L2=1.0, D=2 ** 25, iters=1)

train_texts= ["Cut down a tree with a herring? It can't be done.", "Don't say that word.", "How can we not say the word if you don't tell us what it is?"]
train_labels= [1, 0, 1]
test_texts= ["Wait! I said it! I said it! Ooh! I said it again!"]

values = wb.transform(train_texts)
clf.fit(values, train_labels)
preds= clf.predict(wb.transform(test_texts))
print("values={}".format(values))
print("values={}".format(len(values)))
print("texts={}".format(test_texts))
print("transformed={}".format(wb.transform(test_texts)))
print(preds)
コード例 #20
0
    def join(self):
        threading.Thread.join(self)
        return self._return


batchsize = 10000000
D = 2**20

wb = wordbatch.WordBatch(None,
                         extractor=(WordHash, {
                             "ngram_range": (1, 1),
                             "analyzer": "word",
                             "lowercase": False,
                             "n_features": D,
                             "norm": None,
                             "binary": True
                         }),
                         minibatch_size=batchsize // 80,
                         procs=8,
                         freeze=True,
                         timeout=1800,
                         verbose=0)
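# Hedged note: WordHash is wordbatch's wrapper around scikit-learn's
# HashingVectorizer, so the keys above (ngram_range, analyzer, lowercase,
# n_features, norm, binary) are passed through to it; with norm=None and
# binary=True each row is just a 0/1 indicator of the hashed tokens.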
clf = FM_FTRL(alpha=0.05,
              beta=0.1,
              L1=0.0,
              L2=0.0,
              D=D,
              alpha_fm=0.02,
              L2_fm=0.0,
              init_fm=0.01,
              weight_fm=1.0,
コード例 #21
0
ファイル: model.py プロジェクト: Konohayui/Kaggle
K_model.fit(X_train_K, Y_train, epochs=4, batch_size=batch_size, verbose=10)
print("Keras model training completed!")

X_train = X_train.drop([
    "main_catL", "subcat1L", "subcat2L", "brand_nameL", "seq_product_desc",
    "price_leak"
], axis=1)
del X_train_K, raw_text
gc.collect()

cutting(X_train)
to_categorical(X_train)

wb1 = wordbatch.WordBatch(normalize_text, \
                        extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0],
                                             "hash_size": 2 ** 29, "norm": None, "tf": 'binary',
                                             "idf": None,}), procs = 8)
wb1.dictionary_freeze = True
wb1.fit(X_train["name"])
X_train_name = wb1.transform(X_train["name"])
mask1 = np.where(X_train_name.getnnz(axis=0) > 1)[0]
X_train_name = X_train_name[:, mask1]

wb2 = wordbatch.WordBatch(normalize_text, \
                        extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.0, 1.0],
                                             "hash_size": 2 ** 28, "norm": "l2", "tf": 1.0,
                                             "idf": None}), procs = 8)
wb2.dictionary_freeze = True
wb2.fit(X_train["item_description"])
X_train_description = wb2.transform(X_train["item_description"])
mask2 = np.where(X_train_description.getnnz(axis=0) > 1)[0]
コード例 #22
0
                    how='left',
                    on='FileID')
    # Sequence of ProductID
    tmp = log_data.groupby('FileID')['ProductID_le'].apply(list)
    data = pd.merge(data,
                    tmp.to_frame().reset_index(),
                    how='left',
                    on='FileID')
    log_data.drop(['CustomerID_le', 'ProductID_le'], axis=1, inplace=True)
    print('Sequential label encoding completed.')

    wb = wordbatch.WordBatch(list2str,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": "l2",
                                 "tf": 'binary',
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_cust = wb.fit_transform(data['CustomerID_le'])
    X_cust = X_cust[:, np.where(X_cust.getnnz(axis=0) > 1)[0]]
    print('Shape of X_cust: {0}'.format(X_cust.shape))
    del wb
    save_sparse_csr(data_path + '/cust_{0}.npz'.format('v1'), X_cust)
    print('Vectorize `CustomerID` completed.')
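    # Hedged sketch: `list2str` is defined elsewhere in this script; a version
    # consistent with its use on the CustomerID/ProductID sequences built above
    # would just join each encoded sequence into one space-separated "document":
    # def list2str(seq):
    #     return ' '.join(str(v) for v in seq) if isinstance(seq, list) else ''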

    wb = wordbatch.WordBatch(list2str,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
コード例 #23
0
def trainFMFTRL(moddict):

    merge = pd.read_csv(trn_file, sep='\t', encoding='utf-8')
    mergetst = pd.read_csv(tst_file, sep='\t', encoding='utf-8')
    #test = pd.read_csv(tst_file, sep='\t', encoding='utf-8')

    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', merge.shape)

    dftt = merge[(merge.price < 1.0)]
    merge = merge.drop(merge[(merge.price < 1.0)].index)
    del dftt['price']
    nrow_train = merge.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(merge["price"])
    merge = pd.concat([merge, dftt])
    merge['target'] = np.log1p(merge["price"])
    #EXTRACT DEVELOPTMENT TEST
    trnidx, validx = train_test_split(range(merge[:nrow_train].shape[0]),
                                      random_state=233,
                                      train_size=0.90)
    gc.collect()
    cpuStats()

    merge = prepFMFeatures(merge)
    mergetst = prepFMFeatures(mergetst)
    cpuStats()
    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    moddict['cross_cols'] = {}
    for i in range(0, len(cross_nm)):
        moddict['cross_cols'][cross_nm[i]] = LabelBinarizer(sparse_output=True)
        moddict['cross_cols'][cross_nm[i]].fit(merge[cross_nm[i]])
        if i == 0:
            x_col = moddict['cross_cols'][cross_nm[i]].transform(
                merge[cross_nm[i]])
        else:
            x_col = hstack(
                (x_col, moddict['cross_cols'][cross_nm[i]].transform(
                    merge[cross_nm[i]])))
        del merge[cross_nm[i]]
    gc.collect()
    cpuStats()
    '''
    Test Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    for i in range(0, len(cross_nm)):
        if i == 0:
            x_coltst = moddict['cross_cols'][cross_nm[i]].transform(
                mergetst[cross_nm[i]])
        else:
            x_coltst = hstack(
                (x_coltst, moddict['cross_cols'][cross_nm[i]].transform(
                    mergetst[cross_nm[i]])))
        del mergetst[cross_nm[i]]
    gc.collect()
    cpuStats()
    '''
    Hash name
    '''
    moddict['wb_name'] = wordbatch.WordBatch(normalize_text,
                                             extractor=(WordBag, {
                                                 "hash_ngrams": 2,
                                                 "hash_ngrams_weights": [1.5, 1.0],
                                                 "hash_size": 2**29,
                                                 "norm": None,
                                                 "tf": 'binary',
                                                 "idf": None,
                                                 'verbose': 1,
                                             }),
                                             procs=8)
    moddict['wb_name'].dictionary_freeze = True
    X_name = moddict['wb_name'].fit_transform(merge['name'])
    moddict['wb_name_mask'] = np.where(
        X_name[:nrow_train].getnnz(axis=0) > 0)[0]
    X_name = X_name[:, moddict['wb_name_mask']]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
    '''
    test Hash name
    '''

    X_name = moddict['wb_name'].transform(mergetst['name'])
    X_name = X_name[:, moddict['wb_name_mask']]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
    '''
    Hash category #2
    '''
    moddict['wb_cat'] = wordbatch.WordBatch(normalize_text,
                                            extractor=(WordBag, {
                                                "hash_ngrams": 2,
                                                "hash_ngrams_weights": [1.0, 1.0],
                                                "hash_size": 2**20,
                                                "norm": None,
                                                "tf": 'binary',
                                                "idf": None,
                                            }),
                                            procs=4)
    moddict['wb_cat'].dictionary_freeze = True
    ### This must be the full dataset
    #cats = merge["category_name"].str.replace('/', ' ').unique()
    moddict['wb_cat'].fit(categories)
    X_cat_tmp = moddict['wb_cat'].transform(categories)
    moddict['wb_cat_dict'] = dict([
        (c, X_cat_tmp.getrow(row))
        for (c, row) in zip(categories.tolist(), range(len(categories)))
    ])
    X_cat = vstack(([
        moddict['wb_cat_dict'][c]
        for c in merge["category_name"].str.replace('/', ' ')
    ]))
    #moddict['wb_cat_mask'] = np.array(np.clip(X_cat[:nrow_train].getnnz(axis=0) - 1, 0, 1), dtype=bool)
    moddict['wb_cat_mask'] = np.where(X_cat[:nrow_train].getnnz(axis=0) > 0)[0]
    X_cat = X_cat[:, moddict['wb_cat_mask']]
    print('[{}] Vectorize `category` completed.'.format(time.time() -
                                                        start_time))
    '''
    Count category
    '''

    moddict['wb_cat_ctgc'] = CountVectorizer()
    moddict['wb_cat_ctgc'].fit(merge['general_cat'])
    X_category1 = moddict['wb_cat_ctgc'].transform(merge['general_cat'])
    moddict['wb_cat_ctsc1'] = CountVectorizer()
    moddict['wb_cat_ctsc1'].fit(merge['subcat_1'])
    X_category2 = moddict['wb_cat_ctsc1'].transform(merge['subcat_1'])
    moddict['wb_cat_ctsc2'] = CountVectorizer()
    moddict['wb_cat_ctsc2'].fit(merge['subcat_2'])
    X_category3 = moddict['wb_cat_ctsc2'].transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    moddict['wb_dscr'] = wordbatch.WordBatch(normalize_text,
                                             extractor=(WordBag, {
                                                 "hash_ngrams": 2,
                                                 "hash_ngrams_weights": [1.0, 0.6],
                                                 "hash_size": 2**28,
                                                 "norm": None,
                                                 "tf": 'binary',
                                                 "idf": None
                                             }),
                                             procs=8)
    moddict['wb_dscr'].dictionary_freeze = True
    X_description = moddict['wb_dscr'].fit_transform(merge['name'] + ' ' +
                                                     merge['item_description'])
    #moddict['wb_dscr_mask'] = np.array(np.clip(X_description[:nrow_train].getnnz(axis=0) - 1, 0, 1), dtype=bool)
    moddict['wb_dscr_mask'] = np.where(
        X_description[:nrow_train].getnnz(axis=0) > 1)[0]
    X_description = X_description[:, moddict['wb_dscr_mask']]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    moddict['wb_brandname'] = LabelBinarizer(sparse_output=True)
    moddict['wb_brandname'].fit(merge['brand_name'][:nrow_train])
    X_brand = moddict['wb_brandname'].transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    moddict['wb_itemcond'] = LabelBinarizer(sparse_output=True)
    moddict['wb_itemcond'].fit(merge['item_condition_id'][:nrow_train])
    X_itemcond = moddict['wb_itemcond'].transform(merge['item_condition_id'])
    print('[{}] Label binarize `item_condition_id` completed.'.format(
        time.time() - start_time))

    moddict['wb_shipping'] = LabelBinarizer(sparse_output=True)
    moddict['wb_shipping'].fit(merge['shipping'][:nrow_train])
    X_shipping = moddict['wb_shipping'].transform(merge['shipping'])
    print('[{}] Label binarize `shipping` completed.'.format(time.time() -
                                                             start_time))

    print(
        X_itemcond.shape,
        X_shipping.shape,  #X_dummies.shape, 
        X_description.shape,
        X_brand.shape,
        X_category1.shape,
        X_category2.shape,
        X_category3.shape,
        X_name.shape,
        X_cat.shape,
        x_col.shape)
    sparse_merge = hstack((
        X_itemcond,
        X_shipping,  #X_dummies, 
        X_description,
        X_brand,
        X_category1,
        X_category2,
        X_category3,
        X_name,
        X_cat,
        x_col)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))

    print(50 * '-')
    cpuStats()
    print(50 * '-')
    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    gc.collect()
    sparse_merge, y = sparse_merge[:nrow_train], y[:nrow_train]
    if develop:
        train_X, valid_X, train_y, valid_y = sparse_merge[trnidx], \
                                        sparse_merge[validx], \
                                        y.values[trnidx], y.values[validx]
        del sparse_merge
        gc.collect()
    print(50 * '*')
    cpuStats()
    print(50 * '*')
    print(train_X.shape[1])
    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=train_X.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=1,
                    inv_link="identity",
                    threads=4)  #iters=15

    print(50 * '|')
    cpuStats()
    print(50 * '|')
    baseline = 1.
    for i in range(15):
        print(50 * '-')
        cpuStats()
        print(50 * '-')
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline - 0.0004:
            baseline = score_
        else:
            break

    moddict['FMmodel'] = model

    print('[{}] Train ridge v2 completed'.format(time.time() - start_time))
    if develop:
        predsfm = moddict['FMmodel'].predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y),
                                          np.expm1(predsfm)))
    gc.collect()

    return merge, moddict, trnidx, validx, nrow_train, predsfm
コード例 #24
0
class FMFtrlModel(object):
    wb = wordbatch.WordBatch(None, extractor=(WordHash, {"ngram_range": (1, 1), "analyzer": "word",
                                                     "lowercase": False, "n_features": D,
                                                     "norm": None, "binary": True})
                         , minibatch_size=batchsize // 80, procs=8, freeze=True, timeout=1800, verbose=0)

    def __init__(self,config):
        self.config = config
        self._build()

    def _build(self):
        D_fm = self.config['D_fm']
        iters = self.config['iters']
        e_clip = self.config['e_clip']
        alpha_fm = self.config['alpha_fm']
        weight_fm = self.config['weight_fm']
        threads = 8

        clf = FM_FTRL(alpha=0.05, beta=0.1, L1=0.0, L2=0.0, D=D, alpha_fm=alpha_fm, L2_fm=0.0, init_fm=0.01, weight_fm=weight_fm,
              D_fm=D_fm, e_noise=0.0, iters=iters, inv_link="sigmoid", e_clip=e_clip, threads=threads, use_avx=1, verbose=0)
        self.model = clf

    def fit(self,data, y, validate=True, weight=None):
        total_data = len(data)
        p = None
        X = None
        rcount = 0
        start_time = time.time()
        #cpuStats()
        step = 200000
        epochs = int(total_data/step)+1
        for epoch in range(epochs):
            start = epoch * step
            end = start + step
            if start >= total_data:
                break
            if end > total_data:
                end = total_data

            str_array = df2csr(data[start:end])
            labels = y[start:end]
            if weight is not None:
                W = weight[start:end]
            else:
                W = None
            if p != None:
                p.join()
            if X is not None:
                del(X)
                gc.collect()
            X= self.wb.transform(str_array)
            del(str_array)
            gc.collect()
            rcount += step
            if rcount % (2 * step) == 0:
                if p != None:  p.join()
                p = threading.Thread(target=evaluate_batch, args=(self.model, X, labels, rcount))
                p.start()
            print("Training", rcount, time.time() - start_time)
            if p != None:  p.join()
            p = threading.Thread(target=fit_batch, args=(self.model, X, labels, W))
            p.start()
        if p != None:  p.join()
        del(X)
        gc.collect()
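    # Hedged sketch: fit_batch, evaluate_batch and predict_batch are small
    # module-level helpers (not shown) called by the worker threads above;
    # versions consistent with this pattern could be:
    # def fit_batch(clf, X, y, w):
    #     clf.partial_fit(X, y, sample_weight=w)
    # def evaluate_batch(clf, X, y, rcount):
    #     print(rcount, "ROC AUC:", roc_auc_score(y, clf.predict(X)))
    # def predict_batch(clf, X):
    #     return clf.predict(X)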

    def predict(self,X_train,weight=None):
        p = None
        test_preds = []
        click_ids = []
        str_array  = df2csr(X_train)
        del(X_train)
        gc.collect()
        X = self.wb.transform(str_array)
        del(str_array)
        gc.collect()
        p = ThreadWithReturnValue(target=predict_batch, args=(self.model, X))
        p.start()

        if p != None:  test_preds += list(p.join())
        del(X)
        gc.collect()
        return test_preds
コード例 #25
0
# Print the number of news articles
print("Listo, cantidad de noticias....")
print(corpus.shape)

# Compute TF-IDF
n_docs = len(corpus['Cuerpo'].tolist())
n_cpu = 2
batch_size = int(n_docs/n_cpu)
_n_words = 500

extractor=(WordBag, {"hash_ngrams": 1, "hash_ngrams_weights": [1.0, 1.0],\
                     "hash_size": 2**22, "norm": "l2", "tf": 1.0,"idf": 1.0})

wb = wordbatch.WordBatch(normalize_text,\
                         extractor= extractor,\
                         procs= n_cpu,\
                         minibatch_size= batch_size)

#WORBBAG_ITEM_DESC_PARAMS = {'hash_ngrams': 2, 'hash_ngrams_weights': [1.0, 1.0],
#                            'hash_size': 2 ** 26, 'norm': 'l2', 'tf': 1.0, 'idf': None}
#wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, WORBBAG_ITEM_DESC_PARAMS),\
#                         procs= n_cpu)

#wb = wordbatch.WordBatch(normalize_text,\
#                         extractor= extractor, procs = n_cpu )
                         
#procs= n_cpu, n_words= 500, minibatch_size= batch_size)
#wb.use_sc = True
wb.dictionary_freeze = True
# b = Batcher(procs=n_cpu, minibatch_size=batch_size, use_sc=True)
# lista = pd.DataFrame([corpus['Cuerpo'].tolist()])
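# Hedged usage note (not shown in this excerpt): the typical next step would be
#   X_tfidf = wb.fit_transform(corpus['Cuerpo'])
# giving a TF-IDF-weighted, L2-normalised bag-of-words hashed into 2**22 columns.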
コード例 #26
0
def Split_Train_Test_FTRL(merge: pd.DataFrame, hand_feature, start_time):
    desc_w1 = param_space_best_WordBatch['desc_w1']
    desc_w2 = param_space_best_WordBatch['desc_w2']
    name_w1 = param_space_best_WordBatch['name_w1']
    name_w2 = param_space_best_WordBatch['name_w2']

    merge['brand_name'] = inductive_brand(merge[['brand_name', 'name']])

    wb = wordbatch.WordBatch(normalize_text=None,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [name_w1, name_w2],
                                 "hash_size": 2**28,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name']).astype(np.float32)
    del wb
    merge.drop(['name'], axis=1, inplace=True)
    X_name = X_name[:,
                    np.array(np.clip(X_name[:TRAIN_SIZE].getnnz(axis=0) -
                                     1, 0, 1),
                             dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb = wordbatch.WordBatch(normalize_text=None,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [desc_w1, desc_w2],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description_train = wb.fit_transform(
        merge['item_description'][:TRAIN_SIZE]).astype(np.float32)
    mask = np.array(np.clip(X_description_train.getnnz(axis=0) - 1, 0, 1),
                    dtype=bool)
    X_description_train = X_description_train[:, mask]
    print('X_description_train done')
    valid_len = merge.shape[0] - TRAIN_SIZE
    valid_len1, valid_len2 = int(valid_len / 3), int(valid_len * 2 / 3)
    X_description_test1 = wb.fit_transform(
        merge['item_description'][TRAIN_SIZE:TRAIN_SIZE + valid_len1]).astype(
            np.float32)
    X_description_test1 = X_description_test1[:, mask]
    print('X_description_test1 done')
    X_description_test2 = wb.fit_transform(
        merge['item_description'][TRAIN_SIZE + valid_len1:TRAIN_SIZE +
                                  valid_len2]).astype(np.float32)
    X_description_test2 = X_description_test2[:, mask]
    print('X_description_test2 done')
    X_description_test3 = wb.fit_transform(
        merge['item_description'][TRAIN_SIZE + valid_len2:]).astype(np.float32)
    X_description_test3 = X_description_test3[:, mask]
    print('X_description_test3 done')
    del wb, mask
    merge.drop(['item_description'], axis=1, inplace=True)
    print(X_description_train.shape, X_description_test1.shape,
          X_description_test2.shape, X_description_test3.shape)
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))
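    # Because the WordBag extractor is hash-based and dictionary_freeze=True,
    # calling fit_transform on the validation chunks maps them into the same
    # 2**28 column space as the training chunk, so the train-derived `mask`
    # can be applied to all four matrices consistently.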

    X_category1, X_category2, X_category3, X_brand = Get_Vectorizor(merge)
    merge.drop(['category_1', 'category_2', 'category_name', 'brand_name'],
               axis=1,
               inplace=True)
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values.astype(np.float32))
    merge.drop(['item_condition_id', 'shipping'], axis=1, inplace=True)
    X_hand_feature = merge[hand_feature].values.astype(np.float32)
    merge.drop(hand_feature, axis=1, inplace=True)
    print('-' * 50)

    # coo_matrix
    X_train = hstack((X_dummies[:TRAIN_SIZE], X_brand[:TRAIN_SIZE],
                      X_category1[:TRAIN_SIZE], X_category2[:TRAIN_SIZE],
                      X_category3[:TRAIN_SIZE], X_hand_feature[:TRAIN_SIZE],
                      X_name[:TRAIN_SIZE], X_description_train),
                     dtype=np.float32)
    print(X_description_train.shape)
    X_description_train = None
    gc.collect()
    print('-' * 50)
    X_test1 = hstack(
        (X_dummies[TRAIN_SIZE:TRAIN_SIZE + valid_len1],
         X_brand[TRAIN_SIZE:TRAIN_SIZE + valid_len1],
         X_category1[TRAIN_SIZE:TRAIN_SIZE + valid_len1],
         X_category2[TRAIN_SIZE:TRAIN_SIZE + valid_len1],
         X_category3[TRAIN_SIZE:TRAIN_SIZE + valid_len1],
         X_hand_feature[TRAIN_SIZE:TRAIN_SIZE + valid_len1],
         X_name[TRAIN_SIZE:TRAIN_SIZE + valid_len1], X_description_test1),
        dtype=np.float32)
    X_description_test1 = None
    gc.collect()
    print('-' * 50)
    X_test2 = hstack(
        (X_dummies[TRAIN_SIZE + valid_len1:TRAIN_SIZE + valid_len2],
         X_brand[TRAIN_SIZE + valid_len1:TRAIN_SIZE + valid_len2],
         X_category1[TRAIN_SIZE + valid_len1:TRAIN_SIZE + valid_len2],
         X_category2[TRAIN_SIZE + valid_len1:TRAIN_SIZE + valid_len2],
         X_category3[TRAIN_SIZE + valid_len1:TRAIN_SIZE + valid_len2],
         X_hand_feature[TRAIN_SIZE + valid_len1:TRAIN_SIZE + valid_len2],
         X_name[TRAIN_SIZE + valid_len1:TRAIN_SIZE + valid_len2],
         X_description_test2),
        dtype=np.float32)
    X_description_test2 = None
    gc.collect()
    print('-' * 50)
    X_test3 = hstack((X_dummies[TRAIN_SIZE + valid_len2:],
                      X_brand[TRAIN_SIZE + valid_len2:],
                      X_category1[TRAIN_SIZE + valid_len2:],
                      X_category2[TRAIN_SIZE + valid_len2:],
                      X_category3[TRAIN_SIZE + valid_len2:],
                      X_hand_feature[TRAIN_SIZE + valid_len2:],
                      X_name[TRAIN_SIZE + valid_len2:], X_description_test3),
                     dtype=np.float32)
    X_description_test3 = None
    gc.collect()

    print(X_dummies.shape, X_brand.shape, X_category1.shape, X_category2.shape,
          X_category3.shape, X_hand_feature.shape, X_name.shape, X_train.shape,
          X_test1.shape, X_test2.shape, X_test3.shape)
    X_dummies, X_brand, X_category1, X_category2, X_category3, X_hand_feature, X_name = None, None, None, None, None, None, None
    gc.collect()

    # csr_matrix
    X_train = X_train.tocsr()
    print('[{}] X_train completed.'.format(time.time() - start_time))
    X_test1 = X_test1.tocsr()
    print('[{}] X_test1 completed.'.format(time.time() - start_time))
    X_test2 = X_test2.tocsr()
    print('[{}] X_test2 completed.'.format(time.time() - start_time))
    X_test3 = X_test3.tocsr()
    print('[{}] X_test3 completed.'.format(time.time() - start_time))
    return X_train, X_test1, X_test2, X_test3
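
# A minimal usage sketch for the matrices returned above, assuming y_train is the
# log1p-transformed price for the first TRAIN_SIZE rows and that FM_FTRL comes from
# wordbatch.models as in the other examples; the names below are illustrative only.
def fit_and_predict_sketch(X_train, y_train, X_test_blocks):
    import numpy as np
    from wordbatch.models import FM_FTRL
    model = FM_FTRL(alpha=0.01, beta=0.01, L1=0.00001, L2=0.1,
                    D=X_train.shape[1], alpha_fm=0.01, L2_fm=0.0, init_fm=0.01,
                    D_fm=200, e_noise=0.0001, iters=15, inv_link="identity",
                    threads=4)
    model.fit(X_train, y_train)
    # Predict each of the three test blocks, then undo the log1p target transform.
    return np.expm1(np.concatenate([model.predict(X) for X in X_test_blocks]))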
コード例 #27
0
def getFMFTRL():
    #os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')
    glove_file = '../feat/glove.6B.50d.txt'
    threads = 4
    save_dir = '../feat'

    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]

    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]
    ix = (merge['brand_name'] == merge['brand_name']) & (
        ~merge['brand_name'].str.lower().fillna('ZZZZZZ').isin(
            merge['name'].str.lower()))
    # Use .loc to avoid pandas chained-assignment issues when prefixing names.
    merge.loc[ix, 'name'] = merge.loc[ix, 'brand_name'] + ' ' + merge.loc[ix, 'name']

    # EXTRACT DEVELOPMENT TEST
    trnidx, validx = train_test_split(range(train.shape[0]),
                                      random_state=233,
                                      train_size=0.90)

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    #merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() -
                                                      start_time))
    '''
    Crossed columns
    '''

    # My understanding of how to replicate what layers.crossed_column does;
    # see https://www.tensorflow.org/tutorials/linear.
    def cross_columns(x_cols):
        """simple helper to build the crossed columns in a pandas dataframe
        """
        crossed_columns = dict()
        colnames = ['_'.join(x_c) for x_c in x_cols]
        for cname, x_c in zip(colnames, x_cols):
            crossed_columns[cname] = x_c
        return crossed_columns
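    # For illustration (an assumed call, not part of the example's flow):
    # cross_columns((['brand_name', 'shipping_str'],)) returns
    # {'brand_name_shipping_str': ['brand_name', 'shipping_str']} -- the helper only
    # maps a crossed-column name to its source columns; the hashing happens below.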

    merge['item_condition_id_str'] = merge['item_condition_id'].astype(str)
    merge['shipping_str'] = merge['shipping'].astype(str)
    x_cols = (
        ['brand_name', 'item_condition_id_str'],
        ['brand_name', 'subcat_1'],
        ['brand_name', 'subcat_2'],
        ['brand_name', 'general_cat'],
        #['brand_name',  'subcat_1',  'item_condition_id_str'],
        #['brand_name',  'subcat_2',  'item_condition_id_str'],
        #['brand_name',  'general_cat',  'item_condition_id_str'],
        ['brand_name', 'shipping_str'],
        ['shipping_str', 'item_condition_id_str'],
        ['shipping_str', 'subcat_2'],
        ['item_condition_id_str', 'subcat_2'])
    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(merge.select_dtypes(include=['object']).columns)

    D = 2**30
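    # For each crossed column: hash every source value modulo D, add a
    # column-specific offset so the same string in different columns maps to a
    # different code, and sum the per-column codes into a single id per row.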
    for k, v in crossed_columns_d.items():
        print('Crossed column ', k)
        outls_ = []
        indicator = 0
        for col in v:
            outls_.append((np.array(merge[col].apply(hash))) % D + indicator)
            indicator += 10**6
        merge[k] = sum(outls_).tolist()
    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    lb = LabelBinarizer(sparse_output=True)
    x_col = lb.fit_transform(merge[cross_nm[0]])
    for i in range(1, len(cross_nm)):
        x_col = hstack((x_col, lb.fit_transform(merge[cross_nm[i]])))
    del (lb)
    '''
    Hash name
    '''

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del (wb)
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1),
                                dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
    '''
    Hash category
    '''

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**20,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    cat = merge["category_name"].str.replace('/', ' ')
    X_cat = wb.fit_transform(cat)
    del (wb)
    X_cat = X_cat[:, np.array(np.clip(X_cat.getnnz(axis=0) - 1, 0, 1),
                              dtype=bool)]
    print('[{}] Vectorize `category` completed.'.format(time.time() -
                                                        start_time))
    '''
    Count category
    '''

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**29,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del (wb)
    X_description = X_description[:, np.array(
        np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)

    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))
    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape)
    sparse_merge = hstack(
        (X_dummies, X_description, X_brand, X_category1, X_category2,
         X_category3, X_name, X_cat, x_col)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))

    # Remove features with document frequency <=1
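    # getnnz(axis=0) gives the per-column document frequency, so clip(df - 1, 0, 1)
    # produces a boolean mask keeping only columns that occur in at least two rows.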
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    print(sparse_merge.shape)
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    mask = np.array(np.clip(X.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    X = X[:, mask]
    X_test = X_test[:, mask]
    print(X.shape)

    gc.collect()
    if develop:
        #train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233)
        train_X, valid_X, train_y, valid_y = X[trnidx], X[validx], y.values[
            trnidx], y.values[validx]

    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=1,
                    inv_link="identity",
                    threads=threads)  #iters=15

    baseline = 1.
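    # Refit one pass at a time and keep going only while the dev RMSLE improves.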
    for i in range(15):
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline:
            baseline = score_
        else:
            break

    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        predsfm = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y),
                                          np.expm1(predsfm)))
        # 0.44532
        # Full data 0.424681

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

    return merge, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
コード例 #28
0
    # normalize_text= default_normalize_text, spellcor_count=0, spellcor_dist= 2, n_words= 10000000,
    # min_df= 0, max_df= 1.0, raw_min_df= -1, procs= 0, verbose= 1, minibatch_size= 20000,

    vectorizer = HashingVectorizer(preprocessor=normalize_text,
                                   decode_error='ignore',
                                   n_features=2**23,
                                   non_negative=False,
                                   ngram_range=(1, 2),
                                   norm='l2')
    start = time.time()
    X = vectorizer.fit_transform(df['text_normalized'])
    print("Process time: {}".format(time.time() - start))
    print(X.shape)

    start = time.time()
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordHash, {
                                 "decode_error": 'ignore',
                                 "n_features": 2**23,
                                 "non_negative": False,
                                 "ngram_range": (1, 2),
                                 "norm": 'l2'
                             }),
                             procs=8
                             #, method="serial"
                             )

    Xwb = wb.fit_transform(df['text_normalized'].values)
    print("Process time: {}".format(time.time() - start))
    print(Xwb.shape)
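
    # Comparison sketch, assuming both hashers keep the 2**23 output columns set
    # above: the two CSR matrices should end up with matching shapes, while the
    # individual values may differ because WordBatch runs its own text pipeline.
    print("shapes:", X.shape, Xwb.shape)
    print("nnz:", X.nnz, "vs", Xwb.nnz)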
コード例 #29
0
def wordbatch_algo():
    import time

    # print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))
    train = pd.read_table('../input/train.tsv', engine='c')
    # Drop rows where price = 0
    train = train[train.price != 0].reset_index(drop=True)
    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)

    y = np.log1p(train["price"])

    nrow_train = train.shape[0]

    # Training
    train['general_cat'], train['subcat_1'], train['subcat_2'] = \
        zip(*train['category_name'].apply(lambda x: split_cat(x)))
    train.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(train)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(train)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(train)
    print('[{}] Convert categorical completed'.format(time.time() -
                                                      start_time))

    # Add some new features:
    X_len_desc = train['item_description'].apply(
        lambda x: len(x)).values.reshape(-1, 1)
    X_len_name = train['name'].apply(lambda x: len(x)).values.reshape(-1, 1)
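    # Note: these length features are computed here (and again for the test chunks
    # below) but never stacked into sparse_merge, so they do not reach the models.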

    print('[{}] Length of text completed.'.format(time.time() - start_time))

    # Name
    wb_name = wordbatch.WordBatch(normalize_text,
                                  extractor=(WordBag, {
                                      "hash_ngrams": 2,
                                      "hash_ngrams_weights": [1.5, 1.0],
                                      "hash_size": 2**29,
                                      "norm": None,
                                      "tf": 'binary',
                                      "idf": None,
                                  }),
                                  procs=8)

    wb_name.dictionary_freeze = True
    wb_name.fit(train['name'])
    X_name = wb_name.transform(train['name'])

    # X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb_cat1 = CountVectorizer()
    wb_cat2 = CountVectorizer()
    wb_cat3 = CountVectorizer()
    wb_cat1.fit(train['general_cat'])
    wb_cat2.fit(train['subcat_1'])
    wb_cat3.fit(train['subcat_2'])

    X_category1 = wb_cat1.transform(train['general_cat'])
    X_category2 = wb_cat2.transform(train['subcat_1'])
    X_category3 = wb_cat3.transform(train['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb_desc = wordbatch.WordBatch(normalize_text,
                                  extractor=(WordBag, {
                                      "hash_ngrams": 2,
                                      "hash_ngrams_weights": [1.0, 1.0],
                                      "hash_size": 2**28,
                                      "norm": "l2",
                                      "tf": 1.0,
                                      "idf": None
                                  }),
                                  procs=8)
    wb_desc.dictionary_freeze = True
    wb_desc.fit(train['item_description'])
    X_description = wb_desc.transform(train['item_description'])

    # X_description = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    lb = LabelBinarizer(sparse_output=True)
    lb.fit(train['brand_name'])
    X_brand = lb.transform(train['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_cond, d_cond = fit_dummy(train['item_condition_id'].tolist())
    X_ship, d_ship = fit_dummy(train['shipping'].tolist())

    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))

    del train
    gc.collect()

    print(X_cond.shape, X_ship.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape)
    sparse_merge = hstack((X_cond, X_ship, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))
    del X_description, X_brand, X_category1, X_category2, X_category3, X_name
    gc.collect()

    # Remove features with document frequency <=1

    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    print(sparse_merge.shape)
    X = sparse_merge

    # ---------------------------------------
    # FM model fit
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(
            X, y, test_size=TEST_SIZE, random_state=SPLIT_SEED)

    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=train_X.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=FM_iter,
                    inv_link="identity",
                    threads=4)

    model.fit(train_X, train_y)
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    print('-' * 20)
    if develop:
        preds = model.predict(X=valid_X)
        print("->>>>  FM_FTRL dev RMSLE:",
              rmsle(np.expm1(valid_y), np.expm1(preds)))

    # ---------------------------------------
    # FTRL model fit
    model2 = FTRL(alpha=0.01,
                  beta=0.01,
                  L1=0.00001,
                  L2=1.0,
                  D=train_X.shape[1],
                  iters=FTRL_iter,
                  inv_link="identity",
                  threads=1)
    # del X; gc.collect()
    model2.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model2.predict(X=valid_X)
        print("->>>>  FTRL dev RMSLE:",
              rmsle(np.expm1(valid_y), np.expm1(preds)))

    # Clear variables:
    del X, train_X, train_y, sparse_merge
    gc.collect()

    # ---------------------------------------
    # Testing by chunk
    print(' FM/FTRL: ...reading the test data...')
    predsFM = []
    predsF = []

    for test in load_test():
        test['general_cat'], test['subcat_1'], test['subcat_2'] = \
            zip(*test['category_name'].apply(lambda x: split_cat(x)))
        test.drop('category_name', axis=1, inplace=True)

        handle_missing_inplace(test)
        #print('[{}] Handle missing completed.'.format(time.time() - start_time))

        cutting(test)
        # print('[{}] Cut completed.'.format(time.time() - start_time))

        to_categorical(test)
        # print('[{}] Convert categorical completed'.format(time.time() - start_time))

        # Add some new features:
        X_len_desc_test = test['item_description'].apply(
            lambda x: len(x)).values.reshape(-1, 1)
        X_len_name_test = test['name'].apply(
            lambda x: len(x)).values.reshape(-1, 1)

        X_name_test = wb_name.transform(test['name'])
        # X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]

        X_category1_test = wb_cat1.transform(test['general_cat'])
        X_category2_test = wb_cat2.transform(test['subcat_1'])
        X_category3_test = wb_cat3.transform(test['subcat_2'])

        X_description_test = wb_desc.transform(test['item_description'])
        # X_description_test = X_description[:, np.array(np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]

        X_brand_test = lb.transform(test['brand_name'])

        X_cond_test = transform_dummy(test['item_condition_id'].tolist(),
                                      d_cond)
        X_ship_test = transform_dummy(test['shipping'].tolist(), d_ship)


        X_test = hstack((X_cond_test, X_ship_test, X_description_test, X_brand_test, X_category1_test, \
                         X_category2_test, X_category3_test, X_name_test)).tocsr()
        X_test = X_test[:, mask]

        # Clear variables:
        del X_cond_test, X_ship_test, X_description_test, X_brand_test, X_category1_test, X_category2_test, X_category3_test, X_name_test
        del test
        gc.collect()

        predsFM_batch = model.predict(X_test)
        predsFM += np.array(predsFM_batch).flatten().tolist()

        predsF_batch = model2.predict(X_test)
        predsF += np.array(predsF_batch).flatten().tolist()

    print(np.array(predsFM))
    print('-' * 20)

    print(np.array(predsF))
    print('-' * 20)

    return np.array(predsFM), np.array(predsF)
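
# A minimal blend sketch, assuming the prediction order matches the rows produced by
# load_test() and using illustrative blend weights; wordbatch_algo() returns both
# prediction arrays in log1p space, so expm1 converts them back to prices.
def write_submission_sketch(test_path='../input/test.tsv'):
    import numpy as np
    import pandas as pd
    predsFM, predsF = wordbatch_algo()
    submission = pd.read_table(test_path, engine='c', usecols=['test_id'])
    submission['price'] = np.expm1(0.6 * predsFM + 0.4 * predsF)
    submission.to_csv('submission_wordbatch_blend.csv', index=False)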
コード例 #30
0
def main():
    start_time = time.time()
    from time import gmtime, strftime
    print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

    # if 1 == 1:
    ###train = pd.read_table('../input/mercari-price-suggestion-challenge/train.tsv', engine='c')
    ###test = pd.read_table('../input/mercari-price-suggestion-challenge/test.tsv', engine='c')

    train = pd.read_table('../input/train.tsv', engine='c')
    test = pd.read_table('../input/test.tsv', engine='c')

    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]
    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    submission = test[['test_id']]

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() -
                                                      start_time))

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del (wb)
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1),
                                dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del (wb)
    X_description = X_description[:, np.array(
        np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)
    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))
    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape)
    sparse_merge = hstack((X_dummies, X_description, X_brand, X_category1,
                           X_category2, X_category3, X_name)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))

    #    pd.to_pickle((sparse_merge, y), "xy.pkl")
    # else:
    #    nrow_train, nrow_test= 1481661, 1482535
    #    sparse_merge, y = pd.read_pickle("xy.pkl")

    # Remove features with document frequency <=1
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 1, 0, 1), dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    gc.collect()
    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X,
                                                              y,
                                                              test_size=0.05,
                                                              random_state=100)

    model = FTRL(alpha=0.01,
                 beta=0.1,
                 L1=0.00001,
                 L2=1.0,
                 D=sparse_merge.shape[1],
                 iters=50,
                 inv_link="identity",
                 threads=1)

    model.fit(train_X, train_y)
    print('[{}] Train FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsF = model.predict(X_test)
    print('[{}] Predict FTRL completed'.format(time.time() - start_time))

    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=15,
                    inv_link="identity",
                    threads=4)
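    # Rough reading of the FM_FTRL settings: alpha/beta/L1/L2 drive the FTRL updates
    # of the linear weights, D is the input feature dimension, alpha_fm/L2_fm/init_fm
    # control the factorization part, and D_fm is the latent factor dimensionality.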

    model.fit(train_X, train_y)
    print('[{}] Train FM_FTRL completed'.format(time.time() - start_time))
    if develop:
        preds = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

    params = {
        'learning_rate': 0.6,
        'application': 'regression',
        'max_depth': 4,
        'num_leaves': 31,
        'verbosity': -1,
        'metric': 'RMSE',
        'data_random_seed': 1,
        'bagging_fraction': 0.6,
        'bagging_freq': 5,
        'feature_fraction': 0.6,
        'nthread': 4,
        'min_data_in_leaf': 100,
        'max_bin': 31
    }

    # Remove features with document frequency <=100
    print(sparse_merge.shape)
    mask = np.array(np.clip(sparse_merge.getnnz(axis=0) - 100, 0, 1),
                    dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    train_X, train_y = X, y
    if develop:
        train_X, valid_X, train_y, valid_y = train_test_split(X,
                                                              y,
                                                              test_size=0.05,
                                                              random_state=100)

    d_train = lgb.Dataset(train_X, label=train_y)
    watchlist = [d_train]
    if develop:
        d_valid = lgb.Dataset(valid_X, label=valid_y)
        watchlist = [d_train, d_valid]

    model = lgb.train(params, train_set=d_train, num_boost_round=6000, valid_sets=watchlist, \
                      early_stopping_rounds=1000, verbose_eval=1000)

    if develop:
        preds = model.predict(valid_X)
        print("LGB dev RMSLE:", rmsle(np.expm1(valid_y), np.expm1(preds)))

    predsL = model.predict(X_test)

    print('[{}] Predict LGB completed.'.format(time.time() - start_time))

    preds = (predsF * 0.2 + predsL * 0.3 + predsFM * 0.5)

    submission['price'] = np.expm1(preds)
    submission.to_csv("submission_wordbatch_ftrl_fm_lgb.csv", index=False)