Example #1
import time

import joblib
import numpy as np
from pyfasttext import FastText


def use_pyfasttext_model():
    # OK
    # Models can be trained with the fastText command-line tool (see ../doc/fastText_train.png)
    # or with the pyfasttext package used in this file.
    """
    # OK: 1. Load a model trained with the pyfasttext package
    model = FastText("../data/lxw_model_sg_pyfasttext.bin")
    print(model["先生"])     # type(model["先生"]): <class 'array.array'>
    print(model.get_numpy_vector("先生"))    # type: <class 'numpy.ndarray'>
    print(model["刘晓伟"])   # OOV
    print(model.get_numpy_vector("刘晓伟"))
    print(model["陈贺"])   # OOV
    print(model.get_numpy_vector("陈贺"))

    model = FastText("../data/lxw_model_cbow_pyfasttext.bin")
    print(model["先生"])
    print(model.get_numpy_vector("先生"))    # type: <class 'numpy.ndarray'>
    print(model["刘晓伟"])   # OOV
    print(model.get_numpy_vector("刘晓伟"))
    print(model["陈贺"])   # OOV
    print(model.get_numpy_vector("陈贺"))
    # NOTE: A quick test shows that the two different models produce identical vectors for the same OOV word
    # (same behavior as with the fasttext package; see NO_2_use_fasttext_model for details),
    # while vectors for in-vocabulary words differ.
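    # A quick check of the NOTE above (a hedged sketch; assumes both model files exist and numpy is imported as np):
    sg_model = FastText("../data/lxw_model_sg_pyfasttext.bin")
    cbow_model = FastText("../data/lxw_model_cbow_pyfasttext.bin")
    print(np.allclose(sg_model.get_numpy_vector("刘晓伟"), cbow_model.get_numpy_vector("刘晓伟")))  # OOV: expected True per the NOTE
    print(np.allclose(sg_model.get_numpy_vector("先生"), cbow_model.get_numpy_vector("先生")))  # in-vocabulary: expected False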
    """

    # OK: 2. Load a model trained with the fasttext command-line tool
    model = FastText("../data/880w_fasttext_skip_gram.bin")
    print(model["先生"])  # type(model["先生"]): <class 'array.array'>
    print(model.get_numpy_vector("先生"))
    # print(model["刘晓伟"])   # OK. OOV
    # print(model["陈贺"])   # OK. OOV

    # Sentence and text vectors.
    sentence_vec = model.get_numpy_sentence_vector("刘晓伟 是 个 好人")
    print(sentence_vec)
    """
class NB_Implement():
    def __init__(self):
        start_time = time.time()
        # self.model = FastText("../data/input/models/sg_pyfasttext.bin")  # DEBUG
        self.model = FastText(
            "../data/input/models/880w_fasttext_skip_gram.bin")
        end_time = time.time()
        print(f"Loading word vector model cost: {end_time - start_time:.2f}s")

        # self.vocab_size, self.vector_size = self.model.numpy_normalized_vectors.shape  # OK
        self.vocab_size = self.model.nwords
        self.vector_size = self.model.args.get("dim")
        print(
            f"self.vector_size:{self.vector_size}, self.vocab_size: {self.vocab_size}"
        )  # self.vector_size:200, self.vocab_size: 925242

        # Sentence representation: {"avg": mean of word vectors, "fasttext": get_numpy_sentence_vector, "matrix": word-vector matrix}
        self.sentence_vec_type = "avg"

    def set_sent_vec_type(self, sentence_vec_type):
        assert sentence_vec_type in [
            "avg", "matrix", "fasttext"
        ], "sentence_vec_type must be in ['avg', 'fasttext', 'matrix']"
        self.sentence_vec_type = sentence_vec_type

    def gen_sentence_vec(self, sentence):
        """
        :param sentence: 
        :return: 
        """
        sentence = sentence.strip()
        if self.sentence_vec_type == "fasttext":
            return self.model.get_numpy_sentence_vector(sentence)

        word_list = sentence.split(" ")
        word_len = len(word_list)
        if self.sentence_vec_type == "matrix":
            sentence_matrix = np.empty(word_len, dtype=list)
            for idx, word in enumerate(word_list):
                sentence_matrix[idx] = self.model.get_numpy_vector(word)
            return sentence_matrix
        else:  # self.sentence_vec_type == "avg":
            sentence_vector = np.zeros(self.vector_size)  # <ndarray>
            # print(f"type(sentence_vector): {type(sentence_vector)}")
            for idx, word in enumerate(word_list):
                # print(f"type(self.model.get_numpy_vector(word)): {type(self.model.get_numpy_vector(word))}")  # <ndarry>
                sentence_vector += self.model.get_numpy_vector(word)
            return sentence_vector / len(word_list)

    def gen_train_val_data(self):
        """
        构造训练, 验证数据
        """
        X_train = list()
        y_train = list()
        for line in open("../data/input/training_set.txt"):
            line = line.strip().split("\t")
            sent_vector = self.gen_sentence_vec(line[-1])
            X_train.append(sent_vector)
            y_train.append(int(line[0]))

        X_val = list()
        y_val = list()
        for line in open("../data/input/validation_set.txt"):
            line = line.strip().split("\t")
            sent_vector = self.gen_sentence_vec(line[-1])
            X_val.append(sent_vector)
            y_val.append(int(line[0]))

        return np.array(X_train), np.array(y_train), np.array(X_val), np.array(y_val)

    def train_bayes(self, X_train, y_train):
        """
        Train a Naive Bayes classifier.
        """
        from sklearn.naive_bayes import GaussianNB
        model = GaussianNB()
        model.fit(X_train, y_train)
        joblib.dump(model, "../data/output/models/bayes_model")

    def evaluate_bayes(self, model_path, X_val, y_val):
        """
        Evaluate the trained Naive Bayes classifier.
        """
        model = joblib.load(model_path)
        y_val = list(y_val)
        correct = 0
        """
        y_predict = list()
        for sent_vec in X_val:  # sent_vec.shape: (self.vector_size,)
            predicted = model.predict(sent_vec.reshape(1, -1))  # sent_vec.reshape(1, -1).shape: (1, self.vector_size)
            y_predict.append(predicted[0])
        """
        y_predict = model.predict(X_val)
        print(f"len(y_predict): {len(y_predict)}, len(y_val): {len(y_val)}")
        assert len(y_predict) == len(
            y_val
        ), "Unexpected Error: len(y_predict) != len(y_val), but it should be"
        for idx in range(len(y_predict)):
            if int(y_predict[idx]) == int(y_val[idx]):
                correct += 1
        score = correct / len(y_predict)
        print(f"Bayes Classification Accuray:{score}")
        return score

    def predict_bayes(self, model_path):
        """
        Test on real-world example sentences.
        """
        model = joblib.load(model_path)
        sentence = "这件 衣服 真的 太 好看 了 ! 好想 买 啊 "
        sent_vec = np.array(self.gen_sentence_vec(sentence)).reshape(1, -1)
        print(f"'{sentence}': {model.predict(sent_vec)}")  # 1: 负向

        sentence = "这个 电视 真 尼玛 垃圾 , 老子 再也 不买 了"
        sent_vec = np.array(self.gen_sentence_vec(sentence)).reshape(1, -1)
        print(f"'{sentence}': {model.predict(sent_vec)}")  # 1: 负向
Example #3
print('[{}] Start fasttext training'.format(time.time() - start_time))
model = fasttext.cbow('ftext_name.txt',
                      'model',
                      dim=24,
                      ws=4,
                      lr=.05,
                      min_count=1,
                      thread=8,
                      epoch=4,
                      silent=0)
modelcb = FastText('model.bin')
print('[{}] Start fasttext mat creation'.format(time.time() - start_time))

ftmat = np.zeros((merge.shape[0], 24))
for c, vals in tqdm(enumerate(merge[['category_name', 'name']].values)):
    ftmat[c] = modelcb.get_numpy_sentence_vector(
        '%s %s' % (vals[0].replace('/', ' '), vals[1]))
ftmat = pd.DataFrame(ftmat)
print('[{}] Finished fasttext mat creation'.format(time.time() - start_time))
ftmat.head()
'''
def make_weights(dict_):
    embmat = np.zeros((len(dict_.keys())+1, 24))
    for k, v in dict_.items():
        embmat[v] = modelcb[k]
    return embmat

embmatcat = make_weights(tok_raw_cat)
embmatnam = make_weights(tok_raw_nam)
embmatdsc = make_weights(tok_raw_dsc)
embmatntk = make_weights(tok_raw_ntk)
'''
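
# The commented-out make_weights helper above builds a pretrained embedding matrix from a
# token-to-index dictionary: row v receives the 24-dim fastText vector of token k, and row 0
# stays zero (typically reserved for padding). A hedged toy usage, assuming the helper is
# uncommented and using a hypothetical vocabulary:
tok_raw_cat = {'women': 1, 'beauty': 2, 'kids': 3}  # hypothetical token -> index dict
embmatcat = make_weights(tok_raw_cat)
print(embmatcat.shape)  # (4, 24)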
Example #4
class Title2Rec(AbstractRecommender):

    def __init__(self, dataset=False, dry=True, w2r_model_file=None, pl_model_file=None, ft_model_file=None,
                 ft_vec_file=None, cluster_file=None, num_clusters=100, fallback=MostPopular, rnn=False):
        super().__init__(dataset, dry=dry)

        if rnn:
            self.init_light(ft_model_file)
            return

        print('Import playlists')
        self.playlists = self.dataset.reader(self.train_playlists, self.train_items)
        self.playlists = np.array(list(filter(lambda p: p['title'] and len(p['items']) > 0, self.playlists)))

        self.fallback = fallback(dataset, dry=dry)

        if os.path.isfile(ft_model_file):
            self.ft_model = FastText(ft_model_file)
        else:
            print('***Full init started***')
            self.num_clusters = num_clusters
            print('- Import w2r models')
            self.w2r_model = self.get_w2r(dataset, dry, w2r_model_file)
            print('- Compute playlists embeddings')
            self.pl_embs = self.compute_pl_embs(pl_model_file)
            print('- Cluster the playlist')
            self.clusters = self.compute_clusters(cluster_file, self.pl_embs, num_clusters)
            print('- Fast_text on the clusters')
            self.ft_model = self.compute_fasttext(ft_model_file)
            print('***Full init end***')

        if not os.path.isfile(ft_vec_file):
            self.title_vecs = [self.get_title_vector_from_playlist(pl) for pl in self.playlists]
            with open(ft_vec_file, 'w') as file_handler:
                file_handler.write('%d %d\n' % (len(self.title_vecs), len(self.title_vecs[0])))
                for idx, vec in enumerate(self.title_vecs):
                    file_handler.write('%d %s\n' % (idx, ' '.join(vec.astype(str))))

        self.pl_vec = KeyedVectors.load_word2vec_format(ft_vec_file, binary=False)
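        # ft_vec_file is in word2vec text format ("N dim" header, then "<playlist index> <vector>" per
        # line), so each playlist index acts as a "word" whose vector is its title embedding;
        # KeyedVectors then powers the most_similar() playlist lookup in recommend().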

        # nltk.download('stopwords')

    def init_light(self, ft_model_file):
        self.ft_model = FastText(ft_model_file)

    def get_w2r(self, dataset, dry, model_file):
        if os.path.isfile(model_file):
            # Load the model
            model = Word2Vec.load(model_file)
            return model.wv
        else:
            # Train the model
            w2r = Word2Rec(dataset, dry=dry, model_file=model_file, mode=sentence.Mode.ITEM)
            return w2r.model

    def compute_fasttext(self, ft_model_file):
        ft_model_file = ft_model_file.replace('.bin', '')
        descr_keywords = self.compute_tfidf_descr()

        documents = []
        for i in np.arange(self.num_clusters):
            involved_pl = np.array(self.playlists)[self.clusters == i]
            # take only collaboratives
            involved_pl = list(filter(lambda pl: pl['collaborative'], involved_pl))

            titles = [process_title(pl['title']).strip() for pl in involved_pl]
            descr = [' '.join(descr_keywords[pl['pid']]) if pl['pid'] in descr_keywords else ''
                     for pl in involved_pl]
            documents.append(titles + descr)

        doc_file = 'models/documents.txt'
        np.savetxt(doc_file, [' '.join(d) for d in documents], fmt='%s')
        model = FastText()
        model.skipgram(input=doc_file, output=ft_model_file, epoch=100, lr=0.1)
        os.remove(doc_file)

        return model

    def compute_tfidf_descr(self):
        description = dict()

        with open('data/playlists_descr.csv', 'r', newline='') as descr_file:
            description_reader = csv.reader(descr_file)
            for d in description_reader:
                text = d[1]
                if "soundiiz" not in text:
                    description[d[0]] = text

        tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 1), min_df=0, stop_words='english')
        corpus = [process_description(v) for k, v in description.items()]

        tfidf_matrix = tf.fit_transform(corpus)
        feature_names = tf.get_feature_names()

        dense = tfidf_matrix.todense()
        keywords = dict()
        for i, k in enumerate(list(description.keys())):
            keywords[k] = get_descr_keywords(i, 3, dense, feature_names)
        return keywords

    def compute_pl_embs(self, pl_model_file):
        if os.path.isfile(pl_model_file):
            # Load the model
            return np.loadtxt(pl_model_file)
        else:
            # Train the model
            _embs = [self.get_vector_from_w2r(playlist) for playlist in self.playlists]
            np.savetxt(pl_model_file, _embs)
            return _embs

    def get_vector_from_w2r(self, playlist):
        _item_embs = list(map(lambda track_id: self.w2r_model[str(track_id)], playlist['items']))
        return np.array(_item_embs).mean(axis=0)

    def compute_clusters(self, cluster_file, pl_embs, num_clusters=100):
        if os.path.isfile(cluster_file):
            return np.loadtxt(cluster_file)
        else:
            # from scipy.cluster.hierarchy import linkage, fcluster
            # Z = linkage(pl_embs, 'ward')
            # clusters = fcluster(Z, num_clusters, criterion='maxclust')

            kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(pl_embs)
            clusters = kmeans.predict(pl_embs)
            np.savetxt(cluster_file, clusters, fmt="%u")
            return clusters

    def get_vector_from_title(self, title):
        return self.ft_model.get_numpy_sentence_vector(process_title(title).strip())

    def get_title_vector_from_playlist(self, playlist):
        return self.get_vector_from_title(playlist['title'])

    def recommend(self, playlist, n=500, n_pl=300):
        if not playlist['title']:
            return self.fallback.recommend(playlist)

        this_vec = self.get_title_vector_from_playlist(playlist)
        seeds = playlist['items']

        # get more popular tracks among the 100 most similar playlists
        most_similar_vec = self.pl_vec.most_similar(positive=[this_vec], topn=n_pl)
        most_similar_pl = self.playlists[[int(v[0]) for v in most_similar_vec]]
        weights = [v[1] for v in most_similar_vec]
        # prioritize collaborative playlists
        # weights = np.multiply(weights, [1.3 if pl['collaborative'] else 1.0 for pl in most_similar_pl])
        # prioritize edited playlists
        # weights = np.multiply(weights, [0.9 + pl['num_edits'] / 10 for pl in most_similar_pl])

        predictions_and_seeds = [pl['items'] for pl in most_similar_pl]
        playlist['items'] = count_and_weights(predictions_and_seeds, seeds, weights)[0:n]
def getFMFTRL():
    #os.chdir('/Users/dhanley2/Documents/mercari/data')
    os.chdir('/home/darragh/mercari/data')
    train = pd.read_csv('../data/train.tsv', sep='\t', encoding='utf-8')
    test = pd.read_csv('../data/test.tsv', sep='\t', encoding='utf-8')
    glove_file = '../feat/glove.6B.50d.txt'
    threads = 4
    save_dir = '../feat'

    print('[{}] Finished to load data'.format(time.time() - start_time))
    print('Train shape: ', train.shape)
    print('Test shape: ', test.shape)
    nrow_test = train.shape[0]  # -dftt.shape[0]

    dftt = train[(train.price < 1.0)]
    train = train.drop(train[(train.price < 1.0)].index)
    del dftt['price']
    nrow_train = train.shape[0]
    # print(nrow_train, nrow_test)
    y = np.log1p(train["price"])
    merge = pd.concat([train, dftt, test])
    merge['target'] = np.log1p(merge["price"])
    submission = test[['test_id']]
    ix = (merge['brand_name'] == merge['brand_name']) & (
        ~merge['brand_name'].str.lower().fillna('ZZZZZZ').isin(
            merge['name'].str.lower()))
    merge['name'][ix] = merge['brand_name'][ix] + ' ' + merge['name'][ix]

    # EXTRACT DEVELOPMENT TEST SET
    trnidx, validx = train_test_split(range(train.shape[0]),
                                      random_state=233,
                                      train_size=0.90)

    del train
    del test
    gc.collect()

    merge['general_cat'], merge['subcat_1'], merge['subcat_2'] = \
        zip(*merge['category_name'].apply(lambda x: split_cat(x)))
    #merge.drop('category_name', axis=1, inplace=True)
    print('[{}] Split categories completed.'.format(time.time() - start_time))

    handle_missing_inplace(merge)
    print('[{}] Handle missing completed.'.format(time.time() - start_time))

    cutting(merge)
    print('[{}] Cut completed.'.format(time.time() - start_time))

    to_categorical(merge)
    print('[{}] Convert categorical completed'.format(time.time() -
                                                      start_time))
    '''
    fasttext mats
    '''
    from pyfasttext import FastText
    import fasttext
    from tqdm import tqdm
    fonm = open('ftext_name.txt', 'w')
    for nm, ct, ds in zip(merge.name_token.str.lower(),
                          merge.category_name_split.str.lower(),
                          merge.description_token.str.lower()):
        fonm.write('%s %s %s\n' % (ct.encode('ascii', 'ignore').decode().lower(),
                                   ds.encode('ascii', 'ignore').decode().lower(),
                                   nm.encode('ascii', 'ignore').decode().lower()))
    fonm.close()

    print('[{}] Start fasttext training'.format(time.time() - start_time))
    model = fasttext.cbow('ftext_name.txt',
                          'model',
                          dim=24,
                          ws=4,
                          lr=.05,
                          min_count=1,
                          thread=8,
                          epoch=4,
                          silent=0)
    modelcb = FastText('model.bin')
    print('[{}] Start fasttext mat creation'.format(time.time() - start_time))

    ftmat = np.zeros((merge.shape[0], 24))
    for c, vals in tqdm(enumerate(merge[['category_name', 'name']].values)):
        ftmat[c] = modelcb.get_numpy_sentence_vector(
            '%s %s' % (vals[0].replace('/', ' '), vals[1]))
    ftmat = coo_matrix(ftmat)
    #ftmat = pd.DataFrame(ftmat)
    #print('[{}] Finished fasttext mat creation'.format(time.time() - start_time))
    #ftmat.head()
    '''
    Crossed columns
    '''

    # My understanding of how to replicate what layers.crossed_column does; see
    # https://www.tensorflow.org/tutorials/linear for background.
    def cross_columns(x_cols):
        """simple helper to build the crossed columns in a pandas dataframe
        """
        crossed_columns = dict()
        colnames = ['_'.join(x_c) for x_c in x_cols]
        for cname, x_c in zip(colnames, x_cols):
            crossed_columns[cname] = x_c
        return crossed_columns
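    # e.g. cross_columns((['brand_name', 'shipping_str'],)) returns
    # {'brand_name_shipping_str': ['brand_name', 'shipping_str']}; the hashing loop below then
    # folds each crossed column's member columns into a single hashed feature.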

    merge['item_condition_id_str'] = merge['item_condition_id'].astype(str)
    merge['shipping_str'] = merge['shipping'].astype(str)
    x_cols = (
        ['brand_name', 'item_condition_id_str'],
        ['brand_name', 'subcat_1'],
        ['brand_name', 'subcat_2'],
        ['brand_name', 'general_cat'],
        #['brand_name',  'subcat_1',  'item_condition_id_str'],
        #['brand_name',  'subcat_2',  'item_condition_id_str'],
        #['brand_name',  'general_cat',  'item_condition_id_str'],
        ['brand_name', 'shipping_str'],
        ['shipping_str', 'item_condition_id_str'],
        ['shipping_str', 'subcat_2'],
        ['item_condition_id_str', 'subcat_2'])
    crossed_columns_d = cross_columns(x_cols)
    categorical_columns = list(merge.select_dtypes(include=['object']).columns)

    D = 2**30
    for k, v in crossed_columns_d.items():
        print('Crossed column ', k)
        outls_ = []
        indicator = 0
        for col in v:
            outls_.append((np.array(merge[col].apply(hash))) % D + indicator)
            indicator += 10**6
        merge[k] = sum(outls_).tolist()
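    # NOTE: Python salts str hashes per process (unless PYTHONHASHSEED is fixed), so these
    # hashed crossed features are only reproducible within a single run.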
    '''
    Count crossed cols
    '''
    cross_nm = [k for k in crossed_columns_d.keys()]
    lb = LabelBinarizer(sparse_output=True)
    x_col = lb.fit_transform(merge[cross_nm[0]])
    for i in range(1, len(cross_nm)):
        x_col = hstack((x_col, lb.fit_transform(merge[cross_nm[i]])))
    del (lb)
    '''
    Encode Original Strings
    '''
    for col in ['item_description', 'name']:
        lb = LabelBinarizer(sparse_output=True)
        if 'X_orig' not in locals():
            X_orig = lb.fit_transform(merge[col].apply(hash))
        else:
            X_orig = hstack((X_orig, lb.fit_transform(merge[col].apply(hash))))
    X_orig = hstack(
        (X_orig,
         lb.fit_transform(
             (merge['item_description'] + merge['name']).apply(hash))))
    X_orig = hstack((X_orig,
                     lb.fit_transform(
                         (merge['brand_name'] + merge['name']).apply(hash))))
    X_orig = hstack((X_orig,
                     lb.fit_transform(
                         (merge['subcat_2'] + merge['name']).apply(hash))))
    X_orig = hstack((X_orig,
                     lb.fit_transform(
                         (merge['brand_name'] + merge['name'] +
                          merge['item_description']).apply(hash))))
    X_orig = X_orig.tocsr()
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 2, 0, 1), dtype=bool)]
    X_orig = X_orig[:, np.array(np.clip(X_orig.getnnz(axis=0) - 5000, 1, 0), dtype=bool)]
    print('Shape of original hash', X_orig.shape)
    X_orig = X_orig.tocoo()
    gc.collect()
    cpuStats()
    '''
    Hash name
    '''

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.5, 1.0],
                                 "hash_size": 2**29,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_name = wb.fit_transform(merge['name'])
    del (wb)
    X_name = X_name[:, np.array(np.clip(X_name.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `name` completed.'.format(time.time() - start_time))
    '''
    Hash category
    '''

    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**20,
                                 "norm": None,
                                 "tf": 'binary',
                                 "idf": None,
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    cat = merge["category_name"].str.replace('/', ' ')
    X_cat = wb.fit_transform(cat)
    del (wb)
    X_cat = X_cat[:, np.array(np.clip(X_cat.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `category` completed.'.format(time.time() -
                                                        start_time))
    '''
    Count category
    '''

    wb = CountVectorizer()
    X_category1 = wb.fit_transform(merge['general_cat'])
    X_category2 = wb.fit_transform(merge['subcat_1'])
    X_category3 = wb.fit_transform(merge['subcat_2'])
    print('[{}] Count vectorize `categories` completed.'.format(time.time() -
                                                                start_time))

    # wb= wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 3, "hash_ngrams_weights": [1.0, 1.0, 0.5],
    wb = wordbatch.WordBatch(normalize_text,
                             extractor=(WordBag, {
                                 "hash_ngrams": 2,
                                 "hash_ngrams_weights": [1.0, 1.0],
                                 "hash_size": 2**28,
                                 "norm": "l2",
                                 "tf": 1.0,
                                 "idf": None
                             }),
                             procs=8)
    wb.dictionary_freeze = True
    X_description = wb.fit_transform(merge['item_description'])
    del (wb)
    X_description = X_description[:, np.array(
        np.clip(X_description.getnnz(axis=0) - 1, 0, 1), dtype=bool)]
    print('[{}] Vectorize `item_description` completed.'.format(time.time() -
                                                                start_time))

    lb = LabelBinarizer(sparse_output=True)
    X_brand = lb.fit_transform(merge['brand_name'])
    print('[{}] Label binarize `brand_name` completed.'.format(time.time() -
                                                               start_time))

    X_dummies = csr_matrix(
        pd.get_dummies(merge[['item_condition_id', 'shipping']],
                       sparse=True).values)

    print('[{}] Get dummies on `item_condition_id` and `shipping` completed.'.
          format(time.time() - start_time))
    print(X_dummies.shape, X_description.shape, X_brand.shape,
          X_category1.shape, X_category2.shape, X_category3.shape,
          X_name.shape, X_cat.shape, x_col.shape, X_orig.shape)
    sparse_merge = hstack(
        (X_dummies, X_description, X_brand, X_category1, X_category2,
         X_category3, X_name, X_cat, x_col, X_orig, ftmat)).tocsr()

    print('[{}] Create sparse merge completed'.format(time.time() -
                                                      start_time))

    # Remove features with document frequency <=1 in the training set
    print(sparse_merge.shape)
    mask = np.array(np.clip(
        sparse_merge[:nrow_train][trnidx].getnnz(axis=0) - 1, 0, 1),
                    dtype=bool)
    sparse_merge = sparse_merge[:, mask]
    X = sparse_merge[:nrow_train]
    X_test = sparse_merge[nrow_test:]
    print(sparse_merge.shape)

    gc.collect()
    if develop:
        #train_X1, valid_X1, train_y1, valid_y1 = train_test_split(X, y, train_size=0.90, random_state=233)
        train_X, valid_X, train_y, valid_y = X[trnidx], X[validx], y.values[
            trnidx], y.values[validx]

    model = FM_FTRL(alpha=0.01,
                    beta=0.01,
                    L1=0.00001,
                    L2=0.1,
                    D=sparse_merge.shape[1],
                    alpha_fm=0.01,
                    L2_fm=0.0,
                    init_fm=0.01,
                    D_fm=200,
                    e_noise=0.0001,
                    iters=1,
                    inv_link="identity",
                    threads=threads)  #iters=15

    baseline = 1.
    for i in range(15):
        model.fit(train_X, train_y, verbose=1)
        predsfm = model.predict(X=valid_X)
        score_ = rmsle(np.expm1(valid_y), np.expm1(predsfm))
        print("FM_FTRL dev RMSLE:", score_)
        if score_ < baseline:
            baseline = score_
        else:
            break
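    # The loop above refits one pass at a time and stops at the first iteration where the
    # validation RMSLE stops improving (the last, slightly worse fit is kept); a simple early-stopping scheme.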

    print('[{}] Train ridge v2 completed'.format(time.time() - start_time))
    if develop:
        predsfm = model.predict(X=valid_X)
        print("FM_FTRL dev RMSLE:", rmsle(np.expm1(valid_y),
                                          np.expm1(predsfm)))
        # 0.44532
        # Full data 0.424681

    predsFM = model.predict(X_test)
    print('[{}] Predict FM_FTRL completed'.format(time.time() - start_time))

    return merge, trnidx, validx, nrow_train, nrow_test, glove_file, predsFM, predsfm
class Preprocessing:
    def __init__(self):
        start_time = time.time()
        # self.model = FastText("../data/input/models/sg_pyfasttext.bin")  # DEBUG
        self.model = FastText(
            "../data/input/models/880w_fasttext_skip_gram.bin")
        end_time = time.time()
        print(f"Loading word vector model cost: {end_time - start_time:.2f}s")

        # self.vocab_size, self.vector_size = self.model.numpy_normalized_vectors.shape  # OK
        self.vocab_size = self.model.nwords
        self.vector_size = self.model.args.get("dim")
        # self.vector_size:200, self.vocab_size: 925242
        print(
            f"self.vector_size:{self.vector_size}, self.vocab_size: {self.vocab_size}"
        )

        # Sentence representation:
        # {"avg": mean of word vectors, "fasttext": get_numpy_sentence_vector, "concatenate": concatenated and padded vectors, "matrix": word-vector matrix}
        self.sentence_vec_type = "matrix"

        self.MAX_SENT_LEN = 70  # DEBUG: hyperparameter; see self.get_sent_max_length()
        # For "concatenate": self.MAX_SENT_LEN = 30. Accuracy with other values: 100: 50.22%, 80: 50.23%, 70: 50.33%, 60: 55.92%, 50: 69.11%, 40: 68.91%, 36: 69.34%, 30: 69.22%, 20: 69.17%, 10: 67.07%
        # For "matrix": self.MAX_SENT_LEN = 70. Results for other values: TODO

    @classmethod
    def data_analysis(cls):
        train_df = pd.read_csv("../data/input/training_set.txt",
                               sep="\t",
                               header=None,
                               names=["label", "sentence"])
        val_df = pd.read_csv("../data/input/validation_set.txt",
                             sep="\t",
                             header=None,
                             names=["label", "sentence"])
        y_train = train_df["label"]
        y_val = val_df["label"]
        sns.set(style="white", context="notebook", palette="deep")
        # Inspect the sample distribution (whether each label is evenly represented)
        sns.countplot(y_train)
        plt.show()
        sns.countplot(y_val)
        plt.show()
        print(y_train.value_counts())
        print(y_val.value_counts())

    def set_sent_vec_type(self, sentence_vec_type):
        assert sentence_vec_type in ["avg", "concatenate", "fasttext", "matrix"], \
            "sentence_vec_type must be in ['avg', 'fasttext', 'concatenate', 'matrix']"
        self.sentence_vec_type = sentence_vec_type

    def get_sent_max_length(self):  # NOT_USED
        sent_len_counter = Counter()
        max_length = 0
        with open("../data/input/training_set.txt") as f:
            for line in f:
                content = line.strip().split("\t")[1]
                content_list = content.split()
                length = len(content_list)
                sent_len_counter[length] += 1
                if max_length <= length:
                    max_length = length
        sent_len_counter = sorted(list(sent_len_counter.items()),
                                  key=lambda x: x[0])
        print(sent_len_counter)
        # [(31, 1145), (32, 1105), (33, 1017), (34, 938), (35, 839), (36, 830), (37, 775), (38, 737), (39, 720), (40, 643), (41, 575), (42, 584), (43, 517), (44, 547), (45, 514), (46, 514), (47, 480), (48, 460), (49, 470), (50, 444), (51, 484), (52, 432), (53, 462), (54, 495), (55, 487), (56, 500), (57, 496), (58, 489), (59, 419), (60, 387), (61, 348), (62, 265), (63, 222), (64, 153), (65, 127), (66, 103), (67, 67), (68, 34), (69, 21), (70, 22), (71, 8), (72, 6), (73, 4), (74, 10), (75, 2), (76, 4), (77, 2), (78, 1), (79, 2), (80, 4), (81, 2), (82, 3), (83, 1), (84, 5), (86, 4), (87, 3), (88, 3), (89, 2), (90, 2), (91, 3), (92, 5), (93, 2), (94, 4), (96, 1), (97, 5), (98, 1), (99, 2), (100, 2), (101, 2), (102, 1), (103, 2), (104, 2), (105, 2), (106, 5), (107, 3), (108, 2), (109, 3), (110, 4), (111, 1), (112, 2), (113, 3), (114, 1), (116, 1), (119, 3), (679, 1)]
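        # The length distribution above motivates MAX_SENT_LEN = 70 in __init__: nearly all
        # training sentences have at most ~70 tokens, with only a thin tail beyond that.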
        return max_length

    def gen_sentence_vec(self, sentence):
        """
        :param sentence: 
        :return: 
        """
        sentence = sentence.strip()
        if self.sentence_vec_type == "fasttext":
            return self.model.get_numpy_sentence_vector(sentence)

        word_list = sentence.split(" ")
        if self.sentence_vec_type == "concatenate":
            sentence_vector = self.model.get_numpy_vector(word_list[0])
            for word in word_list[1:]:
                sentence_vector = np.hstack(
                    (sentence_vector, self.model.get_numpy_vector(word)))
            return sentence_vector  # NOTE: in the "concatenate" case, sentence_vector has a different length for each sentence
        if self.sentence_vec_type == "matrix":  # for Deep Learning.
            sentence_matrix = []
            # NOTE: keeping the tail of the sentence seems to work better (see
            # https://github.com/lxw0109/SentimentClassification_UMICH_SI650/blob/master/src/LSTM_wo_pretrained_vector.py#L86)
            for word in word_list[-self.MAX_SENT_LEN:]:
                sentence_matrix.append(self.model.get_numpy_vector(word))
            length = len(sentence_matrix)
            # Always holds, because of the slicing above
            assert length <= self.MAX_SENT_LEN, "CRITICAL ERROR: len(sentence_matrix) > self.MAX_SENT_LEN."
            # The matrix passed to np.pad is a list of ndarrays; the returned matrix is an ndarray of ndarrays
            sentence_matrix = np.pad(sentence_matrix,
                                     pad_width=((0,
                                                 self.MAX_SENT_LEN - length),
                                                (0, 0)),
                                     mode="constant",
                                     constant_values=-1)
            return sentence_matrix
        else:  # self.sentence_vec_type == "avg":
            sentence_vector = np.zeros(self.vector_size)  # <ndarray>
            # print(f"type(sentence_vector): {type(sentence_vector)}")
            for idx, word in enumerate(word_list):
                # print(f"type(self.model.get_numpy_vector(word)): {type(self.model.get_numpy_vector(word))}")  # <ndarray>
                sentence_vector += self.model.get_numpy_vector(word)
            return sentence_vector / len(word_list)

    def gen_train_val_data(self):
        # Build the training and validation data
        train_df = pd.read_csv("../data/input/training_set.txt",
                               sep="\t",
                               header=None,
                               names=["label", "sentence"])
        val_df = pd.read_csv("../data/input/validation_set.txt",
                             sep="\t",
                             header=None,
                             names=["label", "sentence"])
        # Shuffle the training set. TODO: without shuffling the trained model seems off (the "好看" example sentence always gets predicted as 1?)
        train_df = train_df.sample(frac=1, random_state=1)
        # val_df = val_df.sample(frac=1, random_state=1)  # the validation set does not need shuffling

        X_train = train_df["sentence"]
        X_train_vec = list()
        for sentence in X_train:
            sent_vector = self.gen_sentence_vec(sentence)
            X_train_vec.append(sent_vector)
        y_train = train_df["label"]  # <Series>

        X_val = val_df["sentence"]
        X_val_vec = list()
        for sentence in X_val:
            sent_vector = self.gen_sentence_vec(sentence)
            X_val_vec.append(sent_vector)
        y_val = val_df["label"]  # <Series>

        if self.sentence_vec_type == "concatenate":
            # NOTE: the dtype is required here; with the default "int32" all word-vector values would be truncated to 0
            X_train_vec = sequence.pad_sequences(X_train_vec,
                                                 maxlen=self.MAX_SENT_LEN * self.vector_size,
                                                 value=0,
                                                 dtype=float)
            X_val_vec = sequence.pad_sequences(X_val_vec,
                                               maxlen=self.MAX_SENT_LEN * self.vector_size,
                                               value=0,
                                               dtype=float)

        return np.array(X_train_vec), np.array(X_val_vec), np.array(
            y_train), np.array(y_val)