def make_corpus(tokens, min_count=50):
    """ Creates an lda2vec corpus from tokenized documents.

    :param tokens: 2D array of spaCy token indices, one document per row
    :param min_count: words occurring fewer than this many times are pruned
    :return: corpus, flattened word array, document ids, subsampled (clean) array
    """
    corpus = Corpus()
    corpus.update_word_count(tokens)
    corpus.finalize()
    compact = corpus.to_compact(tokens)
    pruned = corpus.filter_count(compact, min_count=min_count)
    clean = corpus.subsample_frequent(pruned)
    doc_ids = np.arange(pruned.shape[0])
    # compact_to_flat returns (flattened, components); unpack two values only
    flattened, (doc_ids,) = corpus.compact_to_flat(pruned, doc_ids)
    return corpus, flattened, doc_ids, clean
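# A minimal usage sketch (assumption, not from the original source): tokenize
# a couple of placeholder documents with lda2vec's preprocess.tokenize, then
# build the corpus with make_corpus() defined above. The tiny documents and
# min_count value are illustrative only.
texts = [u"first example document about topics", u"second example document"]  # placeholder docs
tokens, vocab = preprocess.tokenize(texts, max_length=10000, merge=False, n_threads=4)
corpus, flattened, doc_ids, clean = make_corpus(tokens, min_count=1)
words = corpus.word_list(vocab)  # string for every compact word index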
bad = set(["ax>", '`@("', '---', '===', '^^^'])


def clean(line):
    return ' '.join(w for w in line.split()
                    if not any(t in w for t in bad))

# Preprocess data
max_length = 10000   # Limit of 10k words per document
# Convert to unicode (spaCy only works with unicode)
texts = [unicode(clean(d)) for d in texts]
tokens, vocab = preprocess.tokenize(texts, max_length, merge=False, n_threads=4)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=30)
# Convert the compactified arrays into bag of words arrays
bow = corpus.compact_to_bow(pruned)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
clean = corpus.subsample_frequent(pruned)
# Now flatten a 2D array of document per row and word position
# per column to a 1D array of words. This will also remove skips
# and OoV words
# Optional: moving the model to the GPU makes it ~10x faster
# set to False if you're having problems with Chainer and CUDA
gpu = cuda.available

logging.basicConfig()

# Fetch data
texts = fetch_20newsgroups(subset="train").data
# Convert to unicode (spaCy only works with unicode)
texts = [unicode(d) for d in texts]

# Preprocess data
max_length = 1000   # Limit of 1k words per document
tokens, vocab = preprocess.tokenize(texts, max_length, tag=False,
                                    parse=False, entity=False)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=5)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
clean = corpus.subsample_frequent(pruned)
# Now flatten a 2D array of document per row and word position
# per column to a 1D array of words. This will also remove skips
# and OoV words
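# A hedged sketch of the flattening step the comment above describes,
# mirroring how the other snippets in this section do it; it assumes the
# `corpus` and `pruned` objects defined above and numpy imported as np.
doc_ids = np.arange(pruned.shape[0])
flattened, (doc_ids,) = corpus.compact_to_flat(pruned, doc_ids)
assert flattened.min() >= 0  # no skip or OoV markers should remain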
def text_prep(text_doc, count):
    text_doc = [digit_removal(d) for d in text_doc]
    text_doc = [stop_word_removal(d) for d in text_doc]
    maxlength_doc = 80000
    text_doc = [unicode(clean(d)) for d in text_doc]
    print(text_doc)
    tokens, vocab = preprocess.tokenize(text_doc, maxlength_doc, merge=False,
                                        n_threads=4)
    print(tokens, vocab)
    corpus = Corpus()
    corpus.update_word_count(tokens)
    corpus.finalize()
    compact = corpus.to_compact(tokens)
    pruned = corpus.filter_count(compact, min_count=50)
    bow = corpus.compact_to_bow(pruned)
    clean_data = corpus.subsample_frequent(pruned)
    doc_ids = numpy.arange(pruned.shape[0])
    flattened, (doc_ids,) = corpus.compact_to_flat(pruned, doc_ids)
    assert flattened.min() >= 0
    pickle.dump(
        vocab,
        open(os.path.join(result_folder, str(count) + 'vocab.pkl'), 'w'))
    pickle.dump(
        corpus,
        open(os.path.join(result_folder, str(count) + 'corpus.pkl'), 'w'))
    numpy.save("flattened", flattened)
    numpy.save("doc_ids", doc_ids)
    numpy.save("pruned", pruned)
    numpy.save("bow", bow)
    # (fragment: inside a loop over document index i, padding short documents)
    if len(readToken[i]) < max_length:
        for k in range(len(readToken[i]), max_length):
            array[i][k] = -2

tokens = array
"""
arr : 2D array of ints
    Has shape (len(texts), max_length). Each value represents
    the word index.
vocab : dict
    Keys are the word index, and values are the string. The pad index gets
    mapped to None
"""
# Make a ranked list of rare vs frequent words
corpus = Corpus()
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=10)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
clean = corpus.subsample_frequent(pruned)

print "n_words", np.unique(clean).max()

story_id = np.array(list(range(0, lineno)), int)
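# A small sketch (assumption, not from the original source) for decoding the
# padded token array described in the docstring above back into strings.
# `decode_row` is a hypothetical helper; -2 is the pad index used earlier.
def decode_row(row, vocab, pad_index=-2):
    # Drop pad positions and map the remaining indices through the vocab dict
    return [vocab[idx] for idx in row if idx != pad_index and idx in vocab]

# e.g. words_of_first_story = decode_row(tokens[0], vocab)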
class Lda2VecFeaturizer:

    def __init__(self,
                 clambda=200,
                 n_topics=10,
                 batchsize=4096,
                 power=0.75,
                 words_pretrained=True,
                 temperature=1,
                 max_length=1000,
                 min_count=0,
                 word2vec_path=None):
        # 'Strength' of the Dirichlet prior; 200.0 seems to work well
        self.clambda = clambda
        # Number of topics to fit
        self.n_topics = n_topics  # int(os.getenv('n_topics', 10))
        self.batchsize = batchsize
        # Power for negative sampling
        self.power = power  # float(os.getenv('power', 0.75))
        # Initialize with pretrained word vectors
        self.words_pretrained = words_pretrained  # bool(int(os.getenv('pretrained', True)))
        self.temp = temperature
        self.max_length = max_length
        self.min_count = min_count
        self.word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

    def preprocess(self, docs=None):
        """ Uses spaCy to quickly tokenize text and return an array of indices.
        This method stores a global NLP directory in memory, and takes up to a
        minute to run for the first time. Later calls will have the tokenizer
        in memory."""
        assert isinstance(docs, list), "input list of documents"
        assert all(isinstance(doc, unicode) for doc in docs), "expected unicode, got string"
        self.corpus = Corpus()
        tokens, self.vocab = preprocess.tokenize(docs, self.max_length,
                                                 merge=False, n_threads=4)
        # Make a ranked list of rare vs frequent words
        self.corpus.update_word_count(tokens)
        self.corpus.finalize()
        # The tokenization uses spaCy indices, and so may have gaps
        # between indices for words that aren't present in our dataset.
        # This builds a new compact index
        compact = self.corpus.to_compact(tokens)
        # Remove extremely rare words
        pruned = self.corpus.filter_count(compact, min_count=0)
        # Convert the compactified arrays into bag of words arrays
        bow = self.corpus.compact_to_bow(pruned)
        # Words tend to have power law frequency, so selectively
        # downsample the most prevalent words
        clean = self.corpus.subsample_frequent(pruned)
        # Now flatten a 2D array of document per row and word position
        # per column to a 1D array of words.
        # This will also remove skips and OoV words
        self.doc_ids = np.arange(pruned.shape[0])
        self.flattened, (self.doc_ids,) = self.corpus.compact_to_flat(pruned, self.doc_ids)
        self.vectors, s, f = self.corpus.compact_word_vectors(self.vocab,
                                                              model=self.word2vec_model)
        # vectors = np.delete(vectors, 77743, 0)

        # Model Parameters
        # Number of documents
        self.n_docs = len(docs)  # doc_ids.max() + 1
        # Number of unique words in the vocabulary
        self.n_vocab = self.flattened.max() + 1
        doc_idx, lengths = np.unique(self.doc_ids, return_counts=True)
        self.doc_lengths = np.zeros(self.doc_ids.max() + 1, dtype='int32')
        self.doc_lengths[doc_idx] = lengths
        # Count all token frequencies
        tok_idx, freq = np.unique(self.flattened, return_counts=True)
        self.term_frequency = np.zeros(self.n_vocab, dtype='int32')
        self.term_frequency[tok_idx] = freq
        self.fraction = self.batchsize * 1.0 / self.flattened.shape[0]
        # Get the string representation for every compact key
        self.words = self.corpus.word_list(self.vocab)[:self.n_vocab]

    def train(self, docs=None, epochs=200, update_words=False, update_topics=True):
        """ Takes the training documents as a list, preprocesses them, and
        returns a dictionary `data` containing the topic distribution, vocab,
        document lengths and topic-word distribution."""
        texts = docs
        docs = []
        for text in texts:
            docs.append(unicode(" ".join(word for word in text.split()
                                         if word in self.word2vec_model.vocab)))
        logging.info("preprocessing...")
        self.preprocess(docs)
        logging.info('preprocessed!')
        self.train_model = LDA2Vec(n_documents=self.n_docs,
                                   n_document_topics=self.n_topics,
                                   n_units=300,
                                   n_vocab=self.n_vocab,
                                   counts=self.term_frequency,
                                   n_samples=15,
                                   power=self.power,
                                   temperature=self.temp)
        if self.words_pretrained:
            self.train_model.sampler.W.data = self.vectors[:self.n_vocab, :]
        optimizer = O.Adam()
        optimizer.setup(self.train_model)
        clip = chainer.optimizer.GradientClipping(5.0)
        optimizer.add_hook(clip)
        j = 0
        msgs = defaultdict(list)
        for epoch in range(epochs):
            print "epoch : ", epoch
            data = prepare_topics(cuda.to_cpu(self.train_model.mixture.weights.W.data).copy(),
                                  cuda.to_cpu(self.train_model.mixture.factors.W.data).copy(),
                                  cuda.to_cpu(self.train_model.sampler.W.data).copy(),
                                  self.words)
            top_words = print_top_words_per_topic(data)
            if j % 100 == 0 and j > 100:
                coherence = topic_coherence(top_words)
                # Use a separate loop variable so the batch counter j is not clobbered
                for t in range(self.n_topics):
                    print t, coherence[(t, 'cv')]
                kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
                # progress[str(epoch)] = pickle.dumps(kw)
            data['doc_lengths'] = self.doc_lengths
            data['term_frequency'] = self.term_frequency
            # np.savez('topics.pyldavis', **data)
            for d, f in utils.chunks(self.batchsize, self.doc_ids, self.flattened):
                t0 = time.time()
                optimizer.zero_grads()
                l = self.train_model.fit_partial(d.copy(), f.copy(),
                                                 update_words=update_words,
                                                 update_topics=update_topics)
                prior = self.train_model.prior()
                loss = prior * self.fraction
                loss.backward()
                optimizer.update()
                msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
                       "P:{prior:1.3e} R:{rate:1.3e}")
                prior.to_cpu()
                loss.to_cpu()
                t1 = time.time()
                dt = t1 - t0
                rate = self.batchsize / dt
                msgs["E"].append(epoch)
                msgs["L"].append(float(l))
                j += 1
                logs = dict(loss=float(l), epoch=epoch, j=j,
                            prior=float(prior.data), rate=rate)
                print msg.format(**logs)
            print "\n ================================= \n"
            # serializers.save_hdf5("lda2vec.hdf5", self.model)
            msgs["loss_per_epoch"].append(float(l))
        return data, msgs

    def initialize_infer(self,
                         clambda=200,
                         batchsize=4096,
                         power=0.75,
                         words_pretrained=True,
                         temperature=1,
                         max_length=1000,
                         min_count=0):
        """ Initializes parameters for testing, if needed. Usually not called. """
        # 'Strength' of the Dirichlet prior; 200.0 seems to work well
        self.clambda = clambda
        self.batchsize = batchsize
        # Power for negative sampling
        self.power = power  # float(os.getenv('power', 0.75))
        # Initialize with pretrained word vectors
        self.words_pretrained = words_pretrained  # bool(int(os.getenv('pretrained', True)))
        self.temp = temperature
        self.max_length = max_length
        self.min_count = min_count
        logging.info('Test parameters initialized!')

    def infer(self, docs=None, epochs=200, update_words=False,
              update_topics=False, topic_vectors=None):
        """ Infers the features of new documents by running the lda2vec
        algorithm again, but updating only the topic distributions."""
        texts = docs
        docs = []
        for text in texts:
            docs.append(unicode(" ".join(word for word in text.split()
                                         if word in self.word2vec_model.vocab)))
        logging.info("preprocessing")
        self.preprocess(docs)
        logging.info('preprocessed!')
        self.infer_model = LDA2Vec(n_documents=self.n_docs,
                                   n_document_topics=self.n_topics,
                                   n_units=300,
                                   n_vocab=self.n_vocab,
                                   counts=self.term_frequency,
                                   n_samples=15,
                                   power=self.power,
                                   temperature=self.temp)
        if self.words_pretrained:
            self.infer_model.sampler.W.data = self.vectors[:self.n_vocab, :]
        self.infer_model.mixture.factors.W.data = self.train_model.mixture.factors.W.data
        if topic_vectors is not None:
            assert topic_vectors.shape == self.infer_model.mixture.factors.W.data.shape, \
                "topic vectors shape doesn't match"
            self.infer_model.mixture.factors.W.data = topic_vectors
        optimizer = O.Adam()
        optimizer.setup(self.infer_model)
        clip = chainer.optimizer.GradientClipping(5.0)
        optimizer.add_hook(clip)
        j = 0
        msgs = defaultdict(list)
        for epoch in range(epochs):
            print "epoch : ", epoch
            data = prepare_topics(cuda.to_cpu(self.infer_model.mixture.weights.W.data).copy(),
                                  cuda.to_cpu(self.infer_model.mixture.factors.W.data).copy(),
                                  cuda.to_cpu(self.infer_model.sampler.W.data).copy(),
                                  self.words)
            top_words = print_top_words_per_topic(data)
            if j % 100 == 0 and j > 100:
                coherence = topic_coherence(top_words)
                # Use a separate loop variable so the batch counter j is not clobbered
                for t in range(self.n_topics):
                    print t, coherence[(t, 'cv')]
                kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
                # progress[str(epoch)] = pickle.dumps(kw)
            data['doc_lengths'] = self.doc_lengths
            data['term_frequency'] = self.term_frequency
            # np.savez('topics.pyldavis', **data)
            for d, f in utils.chunks(self.batchsize, self.doc_ids, self.flattened):
                t0 = time.time()
                optimizer.zero_grads()
                l = self.infer_model.fit_partial(d.copy(), f.copy(),
                                                 update_words=update_words,
                                                 update_topics=update_topics)
                prior = self.infer_model.prior()
                loss = prior * self.fraction
                loss.backward()
                optimizer.update()
                msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
                       "P:{prior:1.3e} R:{rate:1.3e}")
                prior.to_cpu()
                loss.to_cpu()
                t1 = time.time()
                dt = t1 - t0
                rate = self.batchsize / dt
                msgs["E"].append(epoch)
                msgs["L"].append(float(l))
                j += 1
                logs = dict(loss=float(l), epoch=epoch, j=j,
                            prior=float(prior.data), rate=rate)
                print msg.format(**logs)
            print "\n ================================= \n"
            # serializers.save_hdf5("lda2vec.hdf5", self.model)
            msgs["loss_per_epoch"].append(float(l))
        return data, msgs
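# A hedged usage sketch (not part of the original source) for the
# Lda2VecFeaturizer class above: the documents and the word2vec path are
# placeholders; any binary word2vec file and lists of raw text strings would do.
if __name__ == '__main__':
    train_texts = ["deep learning for topic models", "sports news about football"]  # placeholder docs
    new_texts = ["another document about machine learning"]  # placeholder docs
    featurizer = Lda2VecFeaturizer(n_topics=2,
                                   word2vec_path='GoogleNews-vectors-negative300.bin')  # placeholder path
    # Fit topics on the training documents, then infer mixtures for new ones
    train_data, train_msgs = featurizer.train(train_texts, epochs=5)
    infer_data, infer_msgs = featurizer.infer(new_texts, epochs=5)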
def process_data(tokens, vocab, model):
    """ Preprocesses the data by counting word occurrences and filtering on
    those counts. The most frequent words are subsampled, and the vocabulary
    is restricted to words present in the word2vec model's vocabulary.

    :param tokens: spaCy tokens
    :param vocab: spaCy vocabulary
    :param model: word2vec model filename
    :return:
    """
    corpus = Corpus()
    # Make a ranked list of rare vs frequent words
    corpus.update_word_count(tokens)
    corpus.finalize()
    # The tokenization uses spaCy indices, and so may have gaps
    # between indices for words that aren't present in our dataset.
    # This builds a new compact index
    compact = corpus.to_compact(tokens)
    # Remove extremely rare words
    pruned = corpus.filter_count(compact, min_count=15)
    # Convert the compactified arrays into bag of words arrays
    bow = corpus.compact_to_bow(pruned)
    # Words tend to have power law frequency, so selectively
    # downsample the most prevalent words
    clean = corpus.subsample_frequent(pruned)
    # Now flatten a 2D array of document per row and word position
    # per column to a 1D array of words. This will also remove skips
    # and OoV words
    doc_ids = np.arange(pruned.shape[0])
    flattened, (doc_ids,) = corpus.compact_to_flat(pruned, doc_ids)
    assert flattened.min() >= 0
    # Fill in the pretrained word vectors
    # n_dim = 300
    fn_wordvc = model
    print("starts to compact word vectors")
    vectors, s, f = corpus.compact_word_vectors(vocab, filename=fn_wordvc)
    print("done with compact word vectors")
    # Save all of the preprocessed files
    print("now saving files")
    pickle.dump(vocab, open('vocab_' + id + '.pkl', 'w'))
    pickle.dump(corpus, open('corpus_' + id + '.pkl', 'w'))
    np.save('flattened_' + id, flattened)
    np.save('doc_ids_' + id, doc_ids)
    np.save('pruned_' + id, pruned)
    # np.save('bow_' + id, bow)  # Does not seem to be necessary for lda2vec_run.py
    np.save('vectors_' + id, vectors)
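# A minimal loading sketch (assumption, not from the original source): reads
# back the artifacts that process_data() saves, using the same 'id' suffix and
# the '.npy' extension numpy appends. `load_processed` is a hypothetical helper.
import pickle
import numpy as np


def load_processed(id):
    vocab = pickle.load(open('vocab_' + id + '.pkl'))
    corpus = pickle.load(open('corpus_' + id + '.pkl'))
    flattened = np.load('flattened_' + id + '.npy')
    doc_ids = np.load('doc_ids_' + id + '.npy')
    vectors = np.load('vectors_' + id + '.npy')
    return vocab, corpus, flattened, doc_ids, vectors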
def clean(line):
    return ' '.join(w for w in line.split()
                    if not any(t in w for t in bad))

# Preprocess data
max_length = 10000   # Limit of 10k words per document
# Convert to unicode (spaCy only works with unicode)
if mode == 'chinese':
    texts = [clean_chinese(d) for d in texts]
else:
    texts = [unicode(clean(d)) for d in texts]

data, vocab = preprocess.tokenize(texts, max_length, merge=False, n_threads=4)
n_words = len(vocab)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(data)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(data)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=10)
# Convert the compactified arrays into bag of words arrays
# bow = corpus.compact_to_bow(pruned)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
# clean = corpus.subsample_frequent(pruned)
# Now flatten a 2D array of document per row and word position
# per column to a 1D array of words. This will also remove skips
# and OoV words
# bad = set(["ax>", '`@("', '---', '===', '^^^'])
bad = stop


def clean(line):
    return ' '.join(w for w in line.split()
                    if not any(t in w for t in bad))

# Preprocess data
max_length = 1000   # Limit of 1k words per document
# Convert to unicode (spaCy only works with unicode)
texts = [unicode(clean(str(d)), errors='ignore') for d in texts]  # shivang
tokens, vocab = preprocess.tokenize(texts, max_length, merge=False, n_threads=4)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=5)
# Convert the compactified arrays into bag of words arrays
bow = corpus.compact_to_bow(pruned)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
clean = corpus.subsample_frequent(pruned)
# Now flatten a 2D array of document per row and word position
# per column to a 1D array of words. This will also remove skips
# and OoV words
remove = ('headers', 'footers', 'quotes')
texts = fetch_20newsgroups(subset='train', remove=remove).data
# Remove tokens with these substrings
bad = set(["ax>", '`@("', '---', '===', '^^^'])


def clean(line):
    return ' '.join(w for w in line.split()
                    if not any(t in w for t in bad))

# Preprocess data
max_length = 10000   # Limit of 10k words per document
# Convert to unicode (spaCy only works with unicode)
texts = [unicode(clean(d)) for d in texts]
tokens, vocab = preprocess.tokenize(texts, max_length, merge=False, n_threads=4)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=30)
# Convert the compactified arrays into bag of words arrays
bow = corpus.compact_to_bow(pruned)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
clean = corpus.subsample_frequent(pruned)
# Now flatten a 2D array of document per row and word position
# per column to a 1D array of words. This will also remove skips
# and OoV words
from lda2vec import preprocess, Corpus
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import os.path
import logging

logging.basicConfig()

# Fetch data
texts = fetch_20newsgroups(subset='train').data
# Convert to unicode (spaCy only works with unicode)
texts = [unicode(d) for d in texts]

# Preprocess data
max_length = 10000   # Limit of 10k words per document
tokens, vocab = preprocess.tokenize(texts, max_length, tag=False,
                                    parse=False, entity=False)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=5)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
clean = corpus.subsample_frequent(pruned)
# Now flatten a 2D array of document per row and word position
# per column to a 1D array of words. This will also remove skips
# and OoV words
for col, dtype in zip(features.columns, features.dtypes):
    if dtype is np.dtype('int64'):
        features[col] = features[col].astype('int32')

# Tokenize the texts
# If this fails it's likely spacy. Install a recent spacy version.
# Only the most recent versions have tokenization of noun phrases
# I'm using SHA dfd1a1d3a24b4ef5904975268c1bbb13ae1a32ff
# Also try running python -m spacy.en.download all --force
texts = features.pop('comment_text').values
tokens, vocab = preprocess.tokenize(texts, max_length, n_threads=4, merge=True)
del texts

# Make a ranked list of rare vs frequent words
corpus = Corpus()
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=10)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
clean = corpus.subsample_frequent(pruned)

print "n_words", np.unique(clean).max()

# Extract numpy arrays over the fields we want covered by topics
        # (fragment: inside preprocessing_save(), looping over records)
        sm_corpus.append(record['Sentences'])

    tokens, vocab = preprocess.tokenize(sm_corpus, max_length, n_threads=4,
                                        merge=False)
    del sm_corpus
    np.save("preprocessed/tokens.npy", tokens)
    # vocab is a dict; np.save stores it as a pickled 0-d object array
    np.save("preprocessed/vocab.npy", vocab)


if __name__ == '__main__':
    # preprocessing_save()
    # np.save does not append another '.npy' when the name already ends in
    # '.npy', so load the exact paths written above
    tokens = np.load("preprocessed/tokens.npy")
    vocab = np.load("preprocessed/vocab.npy")

    # Make a ranked list of rare vs frequent words
    corpus = Corpus()
    corpus.update_word_count(tokens)
    corpus.finalize()
    # The tokenization uses spaCy indices, and so may have gaps
    # between indices for words that aren't present in our dataset.
    # This builds a new compact index
    compact = corpus.to_compact(tokens)
    # Remove extremely rare words
    pruned = corpus.filter_count(compact, min_count=10)
    # Words tend to have power law frequency, so selectively
    # downsample the most prevalent words
    clean = corpus.subsample_frequent(pruned)
    print "n_words", np.unique(clean).max()
    # # Extract numpy arrays over the fields we want covered by topics
    # # Convert to categorical variables