Example #1
    def train(self):
        logging.info(' train tfidf model ... ')
        self.tfidf = models.TfidfModel(self.corpus, normalize=True)
        logging.info(' train word2vec model ... ')
        self.w2v = models.Word2Vec(min_count=2,
                                   window=2,
                                   size=300,
                                   sample=6e-5,
                                   alpha=0.03,
                                   min_alpha=0.0007,
                                   negative=15,
                                   workers=4,
                                   iter=7)
        # build the vocabulary first: no sentences were passed to the constructor,
        # so train() would otherwise fail with an empty vocabulary
        self.w2v.build_vocab(self.data)
        self.w2v.train(self.data,
                       total_examples=self.w2v.corpus_count,
                       epochs=15,
                       report_delay=1)
        logging.info(' train fasttext model ... ')
        self.fast = models.FastText(self.data,
                                    size=300,
                                    window=3,
                                    min_count=1,
                                    iter=10,
                                    min_n=3,
                                    max_n=6,
                                    word_ngrams=2)
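For context, a rough sketch of querying the models this method leaves on the instance (assumes gensim 3.x, where the size/iter keyword names above are valid; the instance `m` and the query words are hypothetical):

# Hypothetical usage after m.train() has run; query words are illustrative only.
print(m.w2v.wv.most_similar('price', topn=5))   # nearest neighbours from the Word2Vec vectors
print(m.fast.wv['pricey'])                      # FastText composes vectors for unseen words from char n-grams
doc_bow = m.corpus[0]                           # a bag-of-words document from the TF-IDF training corpus
print(m.tfidf[doc_bow])                         # TF-IDF-weighted vector for that document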
Example #2
    def fit(self, X, y=None):
        """Fit the model according to the given training data.

        Parameters
        ----------
        X : iterable of iterables of str
            Can be simply a list of lists of tokens, but for larger corpora,
            consider an iterable that streams the sentences directly from disk/network.
            See :class:`~gensim.models.word2vec.BrownCorpus`, :class:`~gensim.models.word2vec.Text8Corpus`
            or :class:`~gensim.models.word2vec.LineSentence` in :mod:`~gensim.models.word2vec` module for such examples.

        Returns
        -------
        :class:`~gensim.sklearn_api.ftmodel.FTTransformer`
            The trained model.

        """
        self.gensim_model = models.FastText(
                sentences=X, sg=self.sg, hs=self.hs, vector_size=self.vector_size,
                alpha=self.alpha, window=self.window, min_count=self.min_count,
                max_vocab_size=self.max_vocab_size, word_ngrams=self.word_ngrams,
                sample=self.sample, seed=self.seed, workers=self.workers,
                min_alpha=self.min_alpha, negative=self.negative,
                ns_exponent=self.ns_exponent, cbow_mean=self.cbow_mean,
                hashfxn=self.hashfxn, epochs=self.epochs, null_word=self.null_word,
                min_n=self.min_n, max_n=self.max_n, sorted_vocab=self.sorted_vocab,
                bucket=self.bucket, trim_rule=self.trim_rule,
                batch_words=self.batch_words
        )
        return self
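A minimal usage sketch of the wrapper above (hypothetical: the toy corpus and hyperparameters are made up, and transform() is assumed to behave as in gensim 3.x's sklearn API, returning one vector per queried word):

docs = [["human", "interface", "computer"], ["survey", "user", "computer", "system"]]
ft = FTTransformer(min_count=1)       # other hyperparameters left at their defaults
ft.fit(docs)                          # trains the underlying models.FastText, as in fit() above
vectors = ft.transform(["computer"])  # numpy array with one row per queried word
print(vectors.shape)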
Example #3
    def trainer(self):
        '''
        @description: Train tfidf, word2vec, fasttext, LDA and autoencoder
        @param {type} None
        @return: None
        '''
        logger.info('train tfidf')
        count_vect = TfidfVectorizer(stop_words=self.stopWords,
                                     max_df=0.4,
                                     min_df=0.001,
                                     ngram_range=(1, 2))
        self.tfidf = count_vect.fit(self.data["text"])
        logger.info('train word2vec')

        self.data['text'] = self.data["text"].apply(lambda x: x.split(' '))
        self.w2v = models.Word2Vec(min_count=2,
                                   window=5,
                                   size=300,
                                   sample=6e-5,
                                   alpha=0.03,
                                   min_alpha=0.0007,
                                   negative=15,
                                   workers=4,
                                   iter=30,
                                   max_vocab_size=50000)
        self.w2v.build_vocab(self.data["text"])
        self.w2v.train(self.data["text"],
                       total_examples=self.w2v.corpus_count,
                       epochs=15,
                       report_delay=1)

        logger.info('train fast')
        # train the FastText word vectors
        self.fast = models.FastText(
            self.data["text"],
            size=300,  # vector dimensionality
            window=3,  # context window
            alpha=0.03,
            min_count=2,  # prune the vocabulary: words below this count are dropped; raising it shrinks the vocab
            iter=30,  # number of iterations (epochs)
            max_n=3,
            word_ngrams=2,
            max_vocab_size=50000)

        logger.info('train lda')
        self.id2word = gensim.corpora.Dictionary(self.data.text)
        corpus = [self.id2word.doc2bow(text) for text in self.data.text]
        self.LDAmodel = LdaMulticore(corpus=corpus,
                                     id2word=self.id2word,
                                     num_topics=30,
                                     workers=2,
                                     chunksize=4000,
                                     passes=7,
                                     alpha='asymmetric')

        logger.info('train autoencoder')
        self.ae.train(self.data)
Example #4
def main():
    import sys
    from pprint import pprint
    corpus_path = sys.argv[1]
    corpus = Corpus(corpus_path)
    w2v = m.Word2Vec(sentences=corpus)
    print()
    print("Word2Vec vectors:")
    pprint(w2v.wv.vectors)
    print("=" * 80)
    w2v.save(f"{corpus_path}.w2v")
    ft = m.FastText(sentences=corpus)
    print()
    print("FastText vectors:")
    pprint(ft.wv.vectors)
    print()
    ft.save(f"{corpus_path}.ft")
Example #5
    def train_model(self, training_sample: TrainingSample):
        self.training_sample = training_sample
        self.documents = training_sample.get_documents()
        self.dictionary = training_sample.dictionary
        self.corpus = training_sample.corpus
        print('\nfastText model: Training the model...')
        start_time = time.time()
        self.model = models.FastText(self.documents, sg=1, hs=1, size=100, alpha=0.025,
                                     window=5, min_count=3, workers=3, min_alpha=0.0001,
                                     negative=10, cbow_mean=1, iter=10, min_n=3, max_n=6,
                                     sorted_vocab=0)
        print('Learning time:', round((time.time() - start_time), 3), 's')

        print('\nFastText model: Building ft matrix...')
        start_time = time.time()
        self.similarity_matrix = self.model.wv.similarity_matrix(
            self.training_sample.dictionary)  # construct similarity matrix

        print('FastText matrix time:', round((time.time() - start_time), 3), 's')
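The similarity matrix built above is the input gensim expects for soft cosine similarity; a small standalone sketch, assuming gensim 3.x (where wv.similarity_matrix and matutils.softcossim are available) and a made-up toy corpus:

from gensim import corpora, models
from gensim.matutils import softcossim

docs = [["machine", "learning", "rocks"], ["deep", "learning", "rules"]]  # toy corpus, illustrative only
dictionary = corpora.Dictionary(docs)
ft = models.FastText(docs, size=50, min_count=1, iter=5)                  # gensim 3.x keyword names
sim_matrix = ft.wv.similarity_matrix(dictionary)                          # same call as in train_model() above
bow1, bow2 = dictionary.doc2bow(docs[0]), dictionary.doc2bow(docs[1])
print(softcossim(bow1, bow2, sim_matrix))                                 # soft cosine similarity of the two documents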
Example #6
data = [
    'I love machine learning', 'I like reading books', 'Python is beautiful',
    'R is horrible.', 'Machine learning is cool!', 'I really like NLP'
]

# pre-process our text
text = [re.sub(r'([^\s\w]|_)+', '', sentence) for sentence in data]
text = [sentence.lower().split() for sentence in text]

# train Word2Vec model on our data
word_model = models.Word2Vec(text, size=50, min_count=1, iter=100)
print(word_model.wv.vectors)
print(word_model.wv.vocab)
print(len(word_model.wv.vocab))
print(word_model.wv['like'])
word_model.wv.most_similar('python')

# train FastText model on our data
word_model = models.FastText(text, size=50, min_count=1, iter=100)
print(word_model.wv.vectors)
print(word_model.wv.vocab)
print(word_model.wv['like'])
word_model.wv.most_similar('python')

word_vectors = list(word_model.wv.vectors)
labels = list(word_model.wv.vocab)
kutils.viz_vectors(word_vectors, labels)
kutils.viz_vectors_corr(word_vectors, labels)
kutils.viz_vectors_lower_dim(word_vectors, labels)
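Note that the size/iter keyword arguments and wv.vocab used above follow the gensim 3.x API; under gensim 4.x the renamed equivalents look roughly like this (a sketch reusing the toy text from above):

# gensim 4.x sketch: size -> vector_size, iter -> epochs, wv.vocab -> wv.key_to_index
word_model = models.Word2Vec(text, vector_size=50, min_count=1, epochs=100)
print(word_model.wv.vectors)               # embedding matrix, unchanged
print(word_model.wv.key_to_index)          # replaces wv.vocab as the word -> index mapping
print(word_model.wv['like'])
print(word_model.wv.most_similar('python'))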
Example #7
def get_embedding_pro(df_raw,
                      sentence_id,
                      word_id,
                      emb_size=128,
                      window=10,
                      dropna=False,
                      n_jobs=4,
                      method='skipgram',
                      hs=0,
                      negative=10,
                      epoch=10,
                      return_model=False,
                      embedding_type='fasttext',
                      slide_window=1):
    """
    Now, set min_count=1 to avoid OOV...
    How to deal with oov in a more appropriate way...
    Paramter:
    ----------
    df_raw: DataFrame contains columns named sentence_id and word_id
    sentence_id: like user ID, will be coerced into str
    word_id: like item ID, will be coerced into str
    emb_size: default 8
    dropna: default False, nans will be filled with 'NULL_zhangqibot'. if True, nans will all be dropped.
    n_jobs: 4 cpus to use as default
    method: 'sg'/'skipgram' or 'cbow'
        sg : {0, 1}, optional
            Training algorithm: 1 for skip-gram; otherwise CBOW.
    hs : {0, 1}, optional
        If 1, hierarchical softmax will be used for model training.
        If 0, and `negative` is non-zero, negative sampling will be used.
    negative : int, optional
        If > 0, negative sampling will be used, the int for negative specifies how many "noise words"
        should be drawn (usually between 5-20).
        If set to 0, no negative sampling is used.
    epoch: iter : int, optional,default 10
        Number of iterations (epochs) over the corpus.
    return_model: default True
    embedding_type: fasttext word2vec
    Return:
    ----------
    Example:
    def run_w2v(sentence_id,word_id,emb_size=128):
        res_dict= w2v_pro(datalog,sentence_id=sentence_id,word_id=word_id,
                          emb_size=emb_size,dropna=False,n_jobs=-1, 
                          method='cbow', hs=0,negative=10,epoch=10,
                          return_model=False)
        Cache.cache_data(res_dict,nm_marker=f'EMB_DICT_W2V_CBOW_10EPOCH_{sentence_id}_{word_id}')

    sentence_id='user_id'
    for word_id in tqdm(['creative_id', 'ad_id', 'product_id', 'advertiser_id']):
        run_w2v(sentence_id,word_id,emb_size=128)

    run_w2v(sentence_id,word_id='product_category',emb_size=8)
    run_w2v(sentence_id,word_id='industry',emb_size=64)
    ----------
    """
    if method.lower() in ['sg', 'skipgram']:
        sg = 1
    elif method.lower() in ['cbow']:
        sg = 0
    else:
        raise NotImplementedError
    list_col_nm = f'{sentence_id}__{word_id}_list'
    if (n_jobs is None) or (n_jobs <= 0):
        n_jobs = multiprocessing.cpu_count()
    print(f"========== W2V:  {sentence_id} {word_id} ==========")

    df = df_raw[[sentence_id, word_id, 'pt_d']].copy()

    if df[sentence_id].isnull().sum() > 0:
        print("NaNs exist in sentence_id column!!")
    if dropna:
        df = df.dropna(subset=[sentence_id, word_id])
    else:
        df[word_id] = df[word_id].fillna(-1).astype(int).astype(str)
        df[sentence_id] = df[sentence_id].fillna(-1).astype(int).astype(str)

    df['pt_d_last'] = df['pt_d'] + slide_window
    fe = df.groupby([sentence_id, 'pt_d_last'])[word_id].apply(
        lambda x: list(x)).reset_index()
    fe.columns = [sentence_id, 'pt_d', list_col_nm]
    df = df.merge(fe, on=[sentence_id, 'pt_d'], how='left')
    df[list_col_nm] = df[list_col_nm].map(lambda x: x
                                          if isinstance(x, list) else [])
    # also append the current row's own word
    df[word_id + '_add'] = df[word_id].map(lambda x: [x])
    df[list_col_nm] = df[list_col_nm] + df[word_id + '_add']
    sentences = df[list_col_nm].values.tolist()
    all_words_vocabulary = df[word_id].unique().tolist()
    del df[list_col_nm], df['pt_d_last'], df[word_id + '_add']
    gc.collect()
    if embedding_type == 'w2v':
        model = Word2Vec(
            sentences,
            size=emb_size,
            window=window,
            workers=n_jobs,
            min_count=1,  # minimum term frequency; min_count > 1 would create OOV words
            sg=sg,  # 1 for skip-gram; otherwise CBOW.
            hs=hs,  # If 1, hierarchical softmax will be used for model training
            negative=negative,  # number of "noise words" for negative sampling
            iter=epoch,
            seed=0)
    else:
        model = models.FastText(sentences,
                                size=emb_size,
                                window=window,
                                workers=n_jobs,
                                seed=0,
                                sg=sg,
                                iter=epoch)

    # get word embedding matrix
    emb_dict = {}
    for word_i in all_words_vocabulary:
        if word_i in model.wv:
            emb_dict[word_i] = model.wv[word_i]
        else:
            emb_dict[word_i] = np.zeros(emb_size)

    # get sentence embedding matrix
    emb_matrix = []
    for seq in sentences:
        vec = []
        for w in seq:
            if w in model.wv:
                vec.append(model.wv[w])
        if len(vec) > 0:
            emb_matrix.append(np.mean(vec, axis=0))
        else:
            emb_matrix.append([0] * emb_size)
    emb_matrix = np.array(emb_matrix)
    emb_cols = []
    for i in range(emb_size):
        df[f'EMB_{embedding_type}_{sentence_id}_{word_id}_{slide_window}_emb_{i}'] = emb_matrix[:, i]
        emb_cols.append(
            f'EMB_{embedding_type}_{sentence_id}_{word_id}_{slide_window}_emb_{i}')

    if not return_model:
        model = None
    return {
        "word_emb_dict": emb_dict,
        "sentence_emb_df": df[emb_cols],
        'model': model
    }
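A minimal end-to-end call with a tiny made-up DataFrame (purely illustrative; it assumes the function above and its module-level imports are available, the column names follow what the function expects, including the 'pt_d' time partition, and the ids repeat often enough to survive the FastText branch's default min_count):

import pandas as pd

toy = pd.DataFrame({
    'user_id': [1, 1, 1, 2, 2, 2] * 3,
    'ad_id':   [1001, 1002, 1001, 1002, 1001, 1002] * 3,
    'pt_d':    [1, 1, 2, 1, 2, 2] * 3,
})
res = get_embedding_pro(toy, sentence_id='user_id', word_id='ad_id',
                        emb_size=8, window=5, epoch=2, embedding_type='fasttext')
print(res['word_emb_dict'].keys())    # one vector per distinct ad_id (as str)
print(res['sentence_emb_df'].shape)   # one averaged sentence embedding per input row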
Example #8
    def trainer_fasttext(self, path):
        print('train fasttext')
        corpus = get_corpus(path, w2v=True)
        fast = models.FastText(corpus, size=300, window=3, min_count=2)
        return fast
Example #9
    def trainer(self):
        '''
        @description: Train tfidf, word2vec, fasttext, LDA and autoencoder
        @param {type} None
        @return: None
        '''

        logger.info('train tfidf')
        '''
        TfidfVectorizer usage:

        stop_words: list of words, e.g. ['word1', 'word2']
        max_df: maximum document frequency, e.g. a term appearing in 4 out of 5 documents has df=0.8;
                terms above the threshold are filtered out
        min_df: the same idea, but as a lower bound
        ngram_range: tuple, (1, 2) means both single words and two-word phrases are counted

        fit():
        Fit the vectorizer/model to the training data and save the vectorizer/model
        to a variable (returns sklearn.feature_extraction.text.TfidfVectorizer)
        Accepted input:
        * ['doc1', 'doc2', ...] where each docN is already tokenized and joined with spaces; a list
        * a DataFrame column

        transform():
        Use the variable output from fit() to transform validation/test data
        (returns scipy.sparse.csr.csr_matrix)

        vocabulary_: mapping from terms to column indices

        ref:
        https://stackoverflow.com/questions/53027864/what-is-the-difference-between-tfidfvectorizer-fit-transfrom-and-tfidf-transform
        https://blog.csdn.net/blmoistawinde/article/details/80816179
        '''
        count_vect = TfidfVectorizer(stop_words=self.stopWords,
                                     max_df=0.4,
                                     min_df=0.001,
                                     ngram_range=(1, 2))
        self.tfidf = count_vect.fit(self.data["text"])  # a DataFrame column is accepted directly
        '''
        Word2Vec parameter notes:
        sg=1 selects the skip-gram algorithm, which is more sensitive to low-frequency words; the default sg=0 is CBOW.
        size is the dimensionality of the output word vectors: too small and collisions between mappings hurt quality,
            too large and the model eats memory and trains slowly; values between 100 and 200 are typical.
        window is the maximum distance between the current and predicted word; 3 means looking at 3-b words before
            the target and b words after it (b is drawn randomly from 0-3).
        min_count filters the vocabulary: words with frequency below min_count are ignored; the default is 5 (commonly 0-100).
        sample (float) - threshold for randomly downsampling higher-frequency words; the useful range is (0, 1e-5).
        hs=1 enables hierarchical softmax; by default hs=0, and if negative is non-zero, negative sampling is used instead.
        workers is the number of worker threads controlling training parallelism; it only takes effect when the
            Cython extension is available, otherwise training runs on a single core.

        train:

        total_examples (int) - number of sentences in the corpus
        report_delay (float) - seconds to wait between progress reports

        ref: https://www.jianshu.com/p/b996e7e0d0b0

        '''

        logger.info('train word2vec')
        self.data['text'] = self.data["text"].apply(
            lambda x: x.split(' '))  ##[[word1,word2],[wordn..]]
        self.w2v = models.Word2Vec(min_count=2,
                                   window=5,
                                   size=300,
                                   sample=6e-5,
                                   alpha=0.03,
                                   min_alpha=0.0007,
                                   negative=15,
                                   workers=4,
                                   iter=30,
                                   max_vocab_size=50000)
        self.w2v.build_vocab(self.data["text"])
        self.w2v.train(self.data["text"],
                       total_examples=self.w2v.corpus_count,
                       epochs=15,
                       report_delay=1)

        logger.info('train fast')
        # train the FastText word vectors
        self.fast = models.FastText(
            self.data["text"],
            size=300,  # vector dimensionality
            window=3,  # context window
            alpha=0.03,
            min_count=2,  # prune the vocabulary: words below this count are dropped; raising it shrinks the vocab
            iter=30,  # number of iterations (epochs)
            max_n=3,
            word_ngrams=2,
            max_vocab_size=50000)
        '''
        About gensim: https://zhuanlan.zhihu.com/p/37175253
        Gensim is an open-source third-party Python toolkit for learning latent topic-vector representations
        of text from raw, unstructured documents in an unsupervised way.

        1. corpora.Dictionary object: the word <-> id mapping
        Dictionary(documents=None, prune_at=2000000)
        The dictionary encapsulates the mapping between normalized words and integer ids.
        Its main method is doc2bow, which converts a collection of words into the bag-of-words representation:
        a list of 2-tuples (word_id, word_frequency).
        If documents are given, they are used to initialize the dictionary (see add_documents()).

        Input: data.text, a nested list such as [[doc1_A, doc1_B], [doc2_C, doc2_D]]

        2. LdaMulticore
        https://radimrehurek.com/gensim/models/ldamulticore.html

        '''
        logger.info('train lda')
        self.id2word = gensim.corpora.Dictionary(self.data.text)  # build the id <-> token dictionary for the corpus
        corpus = [self.id2word.doc2bow(text)
                  for text in self.data.text]  # sparse bag-of-words vectors
        self.LDAmodel = LdaMulticore(corpus=corpus,
                                     id2word=self.id2word,
                                     num_topics=30,
                                     workers=4,
                                     chunksize=4000,
                                     passes=7,
                                     alpha='asymmetric')
        ##lstm
        logger.info('train autoencoder')
        self.ae.train(self.data)
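To make the fit()/transform() split described above concrete, a small standalone sketch with made-up documents (independent of the class; the stop-word and document-frequency thresholds are omitted for brevity):

from sklearn.feature_extraction.text import TfidfVectorizer

train_docs = ["the cat sat", "the dog sat", "the cat ran"]      # toy, space-tokenized documents
vect = TfidfVectorizer(ngram_range=(1, 2), min_df=1)
tfidf = vect.fit(train_docs)                                    # returns the fitted TfidfVectorizer
matrix = tfidf.transform(["the dog ran"])                       # scipy.sparse.csr_matrix, one row per document
print(sorted(tfidf.vocabulary_.items(), key=lambda kv: kv[1]))  # term -> column index mapping
print(matrix.shape)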
Example #10
df = pd.read_sql(
    'wikileaks',
    con,
)


class MySentences(object):
    def __init__(self):
        self.con = create_engine(
            'sqlite://///home/luiztheodoro/Documentos/mestrado/iri/trab_final/wikileaks.sqlite',
            echo=False)
        self.df = pd.read_sql(
            'wikileaks',
            self.con,
        )

    def __iter__(self):
        for index, row in self.df.iterrows():
            yield row['lista'].split(" ")


sentences = list(MySentences())

print("Gerando FastText...")
model_ft = models.FastText(size=100)
model_ft.build_vocab(sentences)
model_ft.train(sentences=sentences,
               epochs=model_ft.epochs,
               total_examples=model_ft.corpus_count,
               total_words=model_ft.corpus_total_words)

model_ft.save(
    '/home/luiztheodoro/Documentos/mestrado/iri/trab_final/wikileaks.ft')
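Loading the persisted model later is symmetric; a brief sketch (the path mirrors the save() call above, and the query word is only an assumed in-corpus token):

from gensim import models

model_ft = models.FastText.load(
    '/home/luiztheodoro/Documentos/mestrado/iri/trab_final/wikileaks.ft')
print(model_ft.wv.most_similar('wikileaks', topn=5))  # assumes 'wikileaks' occurs in the training corpus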
# turn our tokenized documents into an id <-> term dictionary
if not os.path.isfile('./dictionary.dict'):
    print('Turn our tokenized documents into an id <-> term dictionary ...', end='')
    sys.stdout.flush()
    dictionary = corpora.Dictionary(texts)
    dictionary.save('./dictionary.dict')
else:
    print('Loading id <-> term dictionary from ./dictionary.dict ...', end='')
    sys.stdout.flush()
    dictionary = corpora.Dictionary.load('./dictionary.dict')
print(' Done!')

# ignore words that appear in fewer than 2 documents or in more than 50% of documents
print("Filtering out overly rare and overly frequent words ...")
dictionary.filter_extremes(no_below=2, no_above=0.5)
for i, text in enumerate(texts):
    filtered_text = []
    for w in text:
        if w in dictionary.token2id: filtered_text.append(w)
    texts[i] = filtered_text

del dictionary

# Learn the FastText model
print('Learning the FastText model ...', end='')
sys.stdout.flush()
fasttextmodel = models.FastText(texts, size=NUM_TOPICS, workers=8, iter=25, window=8)
fasttextmodel.save('fasttextmodel'+str(NUM_TOPICS)+'.fasttext')
print(' Done!')
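Because FastText stores character n-gram vectors, the saved model can later produce embeddings even for words it never saw; a hedged follow-up sketch (the query terms are made up, and NUM_TOPICS is the same constant used when saving above):

fasttextmodel = models.FastText.load('fasttextmodel' + str(NUM_TOPICS) + '.fasttext')
print(fasttextmodel.wv.most_similar('government', topn=10))  # assumes 'government' occurs in the corpus
print(fasttextmodel.wv['govermnent'])                        # a misspelled/OOV token still gets a vector from its char n-grams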