Beispiel #1
0
    def build_gensim(self, docs, model=None):
        """Train a GloVe model on ``docs`` and sum word vectors per document.

        ``docs`` is iterated twice (once to build the co-occurrence matrix,
        once to build the document vectors), so it must be re-iterable; a
        one-shot generator would be exhausted after the first pass.

        :param docs: iterable of raw documents, each passed to
            ``DocumentPreprocessor().tokenizer``.
        :param model: optional pre-built model to train; when ``None`` a
            ``glove.Glove`` with 600 components is created.
        :return: tuple ``(doc_vectors, model)`` where ``doc_vectors`` is a
            NumPy array with one summed vector per document (all zeros for
            an empty document or when no word lookup succeeds).
        """
        dp = DocumentPreprocessor()

        # Get the word co-occurrence matrix -- needs lots of RAM!!
        cooccur = glove.Corpus()
        cooccur.fit((dp.tokenizer(doc) for doc in docs), window=10)

        # and train GloVe model itself, using 10 epochs
        if model is None:
            model = glove.Glove(no_components=600, learning_rate=0.05)
        model.fit(cooccur.matrix, epochs=10)

        # The vector width does not change between documents; compute it once.
        vector_size = len(model.word_vectors[0])

        doc_vectors = []
        for doc in (dp.tokenizer(doc) for doc in docs):
            # np.float was only an alias of the builtin float (float64) and
            # has been removed from NumPy; use the explicit dtype instead.
            doc_vector = np.zeros(vector_size, dtype=np.float64)
            if len(doc):
                for word in doc:
                    try:
                        # NOTE(review): relies on ``model`` supporting item
                        # lookup by word -- confirm for the model type used.
                        doc_vector += model[word]
                    except Exception:
                        # Best effort: skip words the model cannot resolve,
                        # but let KeyboardInterrupt/SystemExit propagate.
                        log.debug(
                            'Word: {} doesn\'t appear in model.'.format(word))
            else:
                log.debug('Empty document in data')
            doc_vectors.append(doc_vector)

        return np.array(doc_vectors), model
Beispiel #2
0
    def train_all(self):
        """
        Build the co-occurrence corpus from ``self.read_input()`` and train GloVe.

        The trained model is stored on ``self.model`` and the corpus
        vocabulary on ``self.dictionary``; the same vocabulary is attached to
        the model.
        :return: None
        """
        sentences = list(self.read_input())

        cooccurrences = glove.Corpus()
        cooccurrences.fit(sentences, window=self.window_size)

        self.model = glove.Glove(
            no_components=self.vector_size,
            learning_rate=self.learning_rate,
            alpha=self.alpha,
        )
        self.model.fit(
            cooccurrences.matrix,
            epochs=self.iterations,
            no_threads=self.workers,
            verbose=True,
        )

        vocabulary = cooccurrences.dictionary
        self.dictionary = vocabulary
        self.model.add_dictionary(vocabulary)
Beispiel #3
0
    def fit(self, corpus=None, size=2):  # , distance_metric=0, zero_out_diag=False):
        """Build the co-occurrence matrix for the corpus and store it on ``self.nw_xy``.

        :param corpus: corpus to fit on; when given it replaces ``self.corpus``,
            when ``None`` the previously stored ``self.corpus`` is used.
        :param size: co-occurrence window passed to ``glove.Corpus.fit``.
        :return: ``self``, to allow call chaining.
        :raises AssertionError: if no vocabulary (``self.token2id``) or no
            corpus is available.
        """
        if corpus is not None:
            self.corpus = corpus

        assert self.token2id is not None, "Fit with no vocabulary!"
        assert self.corpus is not None, "Fit with no corpus!"

        glove_corpus = glove.Corpus(dictionary=self.token2id)
        # Fit on the stored corpus, not the raw argument: the argument is None
        # whenever the caller relies on a corpus that was set earlier.
        glove_corpus.fit(self.corpus, window=size)

        self.nw_xy = glove_corpus.matrix

        return self
def train(word2id, id2word, corpus, win, dim):
    """Train a GloVe model on ``corpus()`` and pickle it to ``./model/glove.model``.

    :param word2id: vocabulary mapping handed to ``glove.Corpus``.
    :param id2word: accepted for interface compatibility; not used here.
    :param corpus: zero-argument callable returning the sentences to fit on.
    :param win: co-occurrence window size.
    :param dim: number of embedding components.
    """
    cooccurrence = glove.Corpus(dictionary=word2id)
    sentences = corpus()
    cooccurrence.fit(sentences, window=win)

    logger.info("glove model creating")
    dict_size = len(cooccurrence.dictionary)
    logger.info('Dict size: %s' % dict_size)
    collocations = cooccurrence.matrix.nnz
    logger.info('Collocations: %s' % collocations)

    glove_model = glove.Glove(no_components=dim, learning_rate=0.05)
    glove_model.fit(
        cooccurrence.matrix,
        epochs=10,
        no_threads=5,
        verbose=True,
    )
    glove_model.add_dictionary(cooccurrence.dictionary)

    # Attach unicode-keyed lookup tables in both directions to the model.
    glove_model.word2id = {
        utils.to_unicode(word): index
        for word, index in glove_model.dictionary.items()
    }
    glove_model.id2word = gensim.utils.revdict(glove_model.word2id)

    utils.pickle(glove_model, './model/glove.model')
Beispiel #5
0
def glove2vec(text_sentence,
              win=10,
              noc=1,
              lr=0.05,
              epochs=10,
              nothr=1,
              verbose=True):
    """Fit a GloVe model on ``text_sentence`` and return it with its dictionary attached.

    :param text_sentence: sentences handed to ``glove.Corpus.fit``.
    :param win: co-occurrence window size.
    :param noc: number of embedding components.
    :param lr: learning rate.
    :param epochs: number of training epochs.
    :param nothr: number of training threads.
    :param verbose: forwarded to ``glove.Glove.fit``.
    :return: the trained ``glove.Glove`` instance.
    """
    cooccurrence = glove.Corpus()
    cooccurrence.fit(text_sentence, window=win)

    fit_options = dict(epochs=epochs, no_threads=nothr, verbose=verbose)
    embedding_model = glove.Glove(no_components=noc, learning_rate=lr)
    embedding_model.fit(cooccurrence.matrix, **fit_options)
    embedding_model.add_dictionary(cooccurrence.dictionary)

    return embedding_model
Beispiel #6
0
    def generate_embedding_dictionary(docs_tokens, embedding_dim, iters, window=2, learning_rate=0.05):
        """Train GloVe on ``docs_tokens`` and return the embeddings keyed by word.

        The reported duration covers building the co-occurrence matrix and
        training the model; attaching the dictionary and assembling the
        result are not timed.

        :return: tuple ``(embedding_dictionary, embedding_dim, word_to_index,
            elapsed_seconds)``.
        """
        started = time()
        cooccurrence = glove.Corpus()
        cooccurrence.fit(docs_tokens, window=window)
        model = glove.Glove(no_components=embedding_dim, learning_rate=learning_rate)
        model.fit(cooccurrence.matrix, epochs=iters, no_threads=4)
        finished = time()

        model.add_dictionary(cooccurrence.dictionary)
        word_to_index = model.dictionary
        index_to_word = model.inverse_dictionary

        # Row ``i`` of the vector matrix belongs to the word with index ``i``.
        embedding_dictionary = {}
        for index, vector in enumerate(model.word_vectors):
            embedding_dictionary[index_to_word[index]] = vector

        elapsed = finished - started
        return embedding_dictionary, embedding_dim, word_to_index, elapsed
Beispiel #7
0
def glove(windows,
          num_components=16,
          glove_window=10,
          epochs=20,
          verbose=False):
    """Group template ids into events by clustering their GloVe embeddings.

    Each window is treated as a sentence of template ids. A GloVe model is
    trained on the windows, the resulting vectors are clustered with HDBSCAN,
    and one ``Event`` is created per cluster label.

    :param windows: iterable of iterables of template ids.
    :param num_components: number of embedding components.
    :param glove_window: co-occurrence window size for ``glove.Corpus.fit``.
    :param epochs: number of GloVe training epochs.
    :param verbose: forwarded to ``glove.Glove.fit``.
    :return: list of ``Event`` objects that have at least one template id.
    """
    # Imported locally; inside this function ``glove`` is the module, not
    # this function.
    import glove
    import hdbscan
    import multiprocessing

    ws = [list(w) for w in windows]
    corpus = glove.Corpus()
    corpus.fit(ws, window=glove_window)
    # TODO: Explore reasonable glove defaults
    glove_model = glove.Glove(no_components=num_components, learning_rate=0.05)
    glove_model.fit(corpus.matrix,
                    epochs=epochs,
                    no_threads=multiprocessing.cpu_count(),
                    verbose=verbose)
    glove_model.add_dictionary(corpus.dictionary)

    # Pair every vocabulary entry with its embedding row.
    labels = []
    vectors = []
    for key, word_vector_index in glove_model.dictionary.items():
        labels.append(key)
        vectors.append(list(glove_model.word_vectors[word_vector_index]))

    # Clustering
    output_events = defaultdict(list)
    cluster_ids = hdbscan.HDBSCAN(min_cluster_size=2).fit_predict(vectors)
    for label, cluster_id in zip(labels, cluster_ids):
        output_events[cluster_id].append(label)

    # Create event objects
    events = []
    for member_labels in output_events.values():
        # Materialise the ids as a list: on Python 3 ``map`` returns a lazy
        # iterator, which has no ``len()`` and can be consumed only once.
        event = Event(id=str(uuid.uuid4()),
                      template_ids=[int(label) for label in member_labels])
        if len(event.template_ids) > 0:
            events.append(event)
    return events
Beispiel #8
0
    def __init__(self, docs_tokens, emb_dim, iters, window, learn_rate):
        """Train a GloVe model on ``docs_tokens`` and initialise this embedding from it.

        ``self.time`` ends up holding the seconds spent building the
        co-occurrence matrix, training the model and attaching its
        dictionary. The word -> vector mapping is then handed to
        ``get_from_data`` on the parent of ``EmbeddingModel``.
        """
        self.time = 0.

        # Temporarily holds the start timestamp; replaced by the duration below.
        self.time = time()

        cooccurrence = glove.Corpus()
        cooccurrence.fit(docs_tokens, window=window)
        model = glove.Glove(no_components=emb_dim, learning_rate=learn_rate)
        model.fit(cooccurrence.matrix, epochs=iters, no_threads=4)
        model.add_dictionary(cooccurrence.dictionary)

        self.time = time() - self.time

        word_to_index = model.dictionary
        index_to_word = model.inverse_dictionary

        # Row ``i`` of the vector matrix belongs to the word with index ``i``.
        embedding_dictionary = {}
        for index, vector in enumerate(model.word_vectors):
            embedding_dictionary[index_to_word[index]] = vector

        super(EmbeddingModel, self).get_from_data(
            embedding_dictionary, emb_dim, word_to_index, self)

        self.name = 'glove'
Beispiel #9
0
def train_glove(sentences=None, nr_feature=None, save_name=None):
    """Train a GloVe model and save both the corpus and the model to disk.

    :param sentences: tokenised sentences to train on; when ``None`` the
        Text8 corpus at ``./data/text8`` is loaded.
    :param nr_feature: number of embedding components (default 200).
    :param save_name: path the trained model is saved to
        (default ``./data/glove.model``).

    The co-occurrence corpus is always saved to ``./data/corpus.model``.
    """
    verify_cwd()
    if sentences is None:
        print("preprocessing sentences...")
        sentences = list(word2vec.Text8Corpus('./data/text8'))
        print("{} sentences found.".format(len(sentences)))
    if save_name is None:
        save_name = "./data/glove.model"
    if nr_feature is None:
        nr_feature = 200

    corpus = glove.Corpus()
    print("start fiting sentences...")
    corpus.fit(sentences, window=10)
    gl = glove.Glove(no_components=nr_feature, learning_rate=0.05)
    print("start training glove...")
    gl.fit(corpus.matrix,
           epochs=10,
           no_threads=multiprocessing.cpu_count(),
           verbose=True)
    corpus.save("./data/corpus.model")
    # Honour the caller-supplied path; it was previously ignored in favour of
    # a hard-coded one (which equals the default above).
    gl.save(save_name)
Beispiel #10
0
def glove_pro(df_raw,
              sentence_id,
              word_id,
              emb_size=128,
              window=50,
              dropna=False,
              n_jobs=16,
              learning_rate=0.05,
              epoch=8,
              return_model=False):
    """Train GloVe embeddings for the values of one column grouped by another.

    Rows of ``df_raw`` are grouped by ``sentence_id``; the ``word_id`` values
    of each group form one "sentence". All values are cast to ``str`` before
    training, so the returned dictionary is keyed by strings.

    Requires the ``glove_python`` package (``pip install glove_python``).

    Example::

        res = glove_pro(datalog.head(10000), 'user_id', 'industry',
                        emb_size=32, window=20, return_model=True)
        res['word_emb_dict']
        res['model'].most_similar("6", number=10)

    :param df_raw: DataFrame containing the ``sentence_id`` and ``word_id``
        columns.
    :param sentence_id: name of the column that defines the sentences.
    :param word_id: name of the column whose values are the words.
    :param emb_size: number of embedding components.
    :param window: co-occurrence window size.
    :param dropna: drop rows with missing values when ``True``; otherwise
        missing values are replaced by a placeholder token.
    :param n_jobs: number of training threads; ``None`` or a value <= 0 means
        use all CPUs.
    :param learning_rate: GloVe learning rate.
    :param epoch: number of training epochs.
    :param return_model: when ``True`` the trained model is included in the
        result under the key ``"model"``.
    :return: dict with ``"word_emb_dict"`` (word -> vector; a zero vector for
        words missing from the model dictionary) and, if requested,
        ``"model"``.
    """
    if (n_jobs is None) or (n_jobs <= 0):
        n_jobs = multiprocessing.cpu_count()
    logger.info(f"========== GloVE: {sentence_id} {word_id} ==========")
    df = df_raw[[sentence_id, word_id]].copy()
    if df[sentence_id].isnull().sum() > 0:
        logger.warning("NaNs exist in sentence_id column!!")
    if dropna:
        df = df.dropna(subset=[sentence_id, word_id])
    else:
        df = df.fillna('NULL_zhangqibot')
    df = df.astype(str)

    # One list of words per sentence id. A plain ``agg(list)`` is used because
    # the dict-renaming form of SeriesGroupBy.agg is rejected by pandas >= 1.0.
    sentences = df.groupby(sentence_id)[word_id].agg(list).tolist()
    all_words_vocabulary = df[word_id].unique().tolist()
    gc.collect()

    matrix = glv.Corpus()
    matrix.fit(corpus=sentences, window=window)
    model = glv.Glove(no_components=emb_size,
                      learning_rate=learning_rate,
                      alpha=0.75,
                      max_count=100,
                      max_loss=10.0,
                      random_state=666)
    model.fit(matrix.matrix, epochs=epoch, no_threads=n_jobs, verbose=1)
    model.add_dictionary(matrix.dictionary)

    # get word embedding matrix
    emb_dict = {}
    for word_i in all_words_vocabulary:
        if word_i in model.dictionary:
            emb_dict[word_i] = model.word_vectors[model.dictionary[word_i]]
        else:
            emb_dict[word_i] = np.zeros(emb_size, dtype="float32")

    result = {"word_emb_dict": emb_dict}
    if return_model:
        # Previously the flag was accepted but had no effect.
        result["model"] = model
    return result
Beispiel #11
0
                (w, v.index) for w, v in model.vocab.iteritems())
            model.id2word = utils.revdict(model.word2id)
            model.word_vectors = model.syn0norm
            utils.pickle(model, outf('w2v'))

    # GloVe branch: reuse a cached model if one exists; otherwise build
    # (or load) the co-occurrence matrix, train a model and cache it.
    if 'glove' in program:
        if os.path.exists(outf('glove')):
            logger.info("glove model found, loading")
            model = utils.unpickle(outf('glove'))
        else:
            # The co-occurrence matrix is cached separately from the model,
            # so it can be reused when only the model file is missing.
            if os.path.exists(outf('glove_corpus')):
                logger.info("glove corpus matrix found, loading")
                cooccur = utils.unpickle(outf('glove_corpus'))
            else:
                logger.info("glove corpus matrix not found, creating")
                cooccur = glove.Corpus(dictionary=word2id)
                cooccur.fit(corpus(), window=WINDOW)
                utils.pickle(cooccur, outf('glove_corpus'))
            logger.info("glove model not found, creating")
            model = glove.Glove(no_components=DIM, learning_rate=0.05)
            model.fit(cooccur.matrix,
                      epochs=10,
                      no_threads=WORKERS,
                      verbose=True)
            model.add_dictionary(cooccur.dictionary)
            # Attach unicode-keyed lookup tables in both directions.
            # NOTE(review): iteritems() is the Python 2 dict API -- confirm
            # the target interpreter / the type of model.dictionary.
            model.word2id = dict((utils.to_unicode(w), id)
                                 for w, id in model.dictionary.iteritems())
            model.id2word = gensim.utils.revdict(model.word2id)
            utils.pickle(model, outf('glove'))

    if 'pmi' in program:
Beispiel #12
0
 def train(self, train_file):
     """Train ``self.model`` on the corpus read from ``train_file``.

     The sentences come from ``self.read_custom_corpus(train_file)``; the
     trained ``DGlove`` model is stored on ``self.model`` with the corpus
     dictionary attached.
     """
     sentences = self.read_custom_corpus(train_file)
     cooccurrence = glove.Corpus()
     cooccurrence.fit(sentences, window=self.window_size)

     self.model = DGlove(no_components=self.dimensions, learning_rate=0.05)
     self.model.fit(
         cooccurrence.matrix,
         no_threads=self.workers,
         epochs=self.epochs,
         verbose=True,
     )
     self.model.add_dictionary(cooccurrence.dictionary)
# coding: utf-8
import glove
# Exploratory session: build co-occurrence matrices for a toy corpus with
# glove.Corpus and train a 2-component glove.Glove model on them.
#
# Three inputs from the original transcript were dropped so that the file
# parses and runs top to bottom: an unterminated ``c.fit([...`` call (a
# syntax error), ``c.matrix.to_array()`` and ``glove.most_similar('dog')``.
# The latter two were mistyped calls that the session corrected on the
# very next line (``toarray()`` and ``s.most_similar('dog')``).
c = glove.Corpus()
c.fit(["dog eat food", "people eat food", "dog eat people"], window=3)
c
c.fit([["dog", "eat", "food"], ["people eat food", "dog eat people"]], window=3)
c
c.matrix
c.fit([["dog", "eat", "food"], ["people", "eat", "food"], ["dog", "eat", "people"]], window=3)
c.matrix
c.matrix[1]
c.matrix.toarray()
c.fit([["dog", "eat", "food"], ["people", "eat", "food"], ["dog", "eat", "people"]], window=3)
s = glove.Glove(2, 0.05)
s.fit(c.matrix, epochs = 10, no_threads = 1, verbose = True)
s.add_dictionary(c.dictionary)
c.dictionary

# Start again from a fresh corpus and retrain.
c = glove.Corpus()
c.fit([["dog", "eat", "food"], ["people", "eat", "food"], ["dog", "eat", "people"]], window=3)
c.dictionary
s.fit(c.matrix, epochs = 10, no_threads = 1, verbose = True)
s = glove.Glove(2, 0.05)
s.fit(c.matrix, epochs = 10, no_threads = 1, verbose = True)
s.add_dictionary(c.dictionary)
s.most_similar('dog')
c.matrix.toarray
c.matrix.toarray()
Beispiel #14
0
from gensim import utils, corpora, matutils, models
import glove

# Restrict dictionary to the 30k most common words.
corpus_file_name = ''

wiki = models.word2vec.LineSentence(corpus_file_name)
id2word = corpora.Dictionary(wiki)
id2word.filter_extremes(keep_n=30000)
word2id = {word: word_id for word_id, word in id2word.iteritems()}


# Filter all wiki documents to contain only those 30k words.
def filter_text(text):
    """Return the words of ``text`` that are present in ``word2id``."""
    return [word for word in text if word in word2id]


def filtered_wiki():
    """Return a fresh generator over the filtered wiki documents."""
    return (filter_text(text) for text in wiki)


# Get the word co-occurrence matrix -- needs lots of RAM!!
cooccur = glove.Corpus()
cooccur.fit(filtered_wiki(), window=10)

# and train GloVe model itself, using 10 epochs
model_glove = glove.Glove(no_components=600, learning_rate=0.05)
model_glove.fit(cooccur.matrix, epochs=10)

# A second, argument-less ``model_glove.fit()`` call used to follow here; it
# passed no co-occurrence matrix and would fail before the model was saved.

model_glove.save('glove_default_30k.model')
Beispiel #15
0
            batch_first=True)

# Target-side field (the '.de' half of each sentence pair).
TRG = Field(
    tokenize=tokenize_de,
    init_token='<sos>',
    eos_token='<eos>',
    lower=True,
    batch_first=True,
)

train, valid, test = torchtext.datasets.WMT14.splits(
    exts=('.en', '.de'),
    fields=(SRC, TRG),
)

# Collect the source and target sentences of every training example.
length = len(train.examples)
src_sentences = []
trg_sentences = []
for i in range(length):
    example_fields = vars(train.examples[i])
    src_sentences.append(example_fields['src'])
    trg_sentences.append(example_fields['trg'])

# One co-occurrence matrix per language side.
corpus = glove.Corpus()
corpus.fit(src_sentences, window=10)
corpus2 = glove.Corpus()
corpus2.fit(trg_sentences, window=10)

# NOTE: from this point on ``glove`` names the model object, not the module.
glove = Glove(no_components=512, learning_rate=0.05)

# Train and save the source-side embeddings ...
glove.fit(corpus.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus.dictionary)
glove.save('src_glove.model')

# ... then refit the same model object on the target side and save it.
glove.fit(corpus2.matrix, epochs=30, no_threads=4, verbose=True)
glove.add_dictionary(corpus2.dictionary)
glove.save('trg_glove.model')