Example #2
def make_corpus(tokens, min_count=50):
    """ Creates an lda2vec corpus from tokenized documents.
    :param tokens: 2D array of spaCy token indices (one document per row)
    :param min_count: drop words that occur fewer than this many times
    :return: corpus, flattened token array, doc_ids, subsampled ("clean") array
    """
    corpus = Corpus()
    corpus.update_word_count(tokens)
    corpus.finalize()

    compact = corpus.to_compact(tokens)

    pruned = corpus.filter_count(compact, min_count=min_count)
    clean = corpus.subsample_frequent(pruned)
    doc_ids = np.arange(pruned.shape[0])
    flattened, (doc_ids, ) = corpus.compact_to_flat(pruned, doc_ids)
    return corpus, flattened, doc_ids, clean
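
A minimal usage sketch (added for illustration, not part of the original snippet; it assumes `texts` is a list of unicode documents and that lda2vec's `preprocess` module is imported alongside numpy):

tokens, vocab = preprocess.tokenize(texts, 10000, merge=False, n_threads=4)
corpus, flattened, doc_ids, clean = make_corpus(tokens, min_count=50)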
Example #4
bad = set(["ax>", '`@("', '---', '===', '^^^'])


def clean(line):
    return ' '.join(w for w in line.split() if not any(t in w for t in bad))
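
# Quick illustration (added, not part of the original snippet): clean() drops any
# whitespace-delimited token that contains one of the `bad` substrings, e.g.
#   >>> clean("see ax>this --- for details")
#   'see for details'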


# Preprocess data
max_length = 10000  # Limit of 10k words per document
# Convert to unicode (spaCy only works with unicode)
texts = [unicode(clean(d)) for d in texts]
tokens, vocab = preprocess.tokenize(texts,
                                    max_length,
                                    merge=False,
                                    n_threads=4)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=30)
# Convert the compactified arrays into bag of words arrays
bow = corpus.compact_to_bow(pruned)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
clean = corpus.subsample_frequent(pruned)
# Now flatten a 2D array of document per row and word position
Example #5
# Optional: moving the model to the GPU makes it ~10x faster
# set to False if you're having problems with Chainer and CUDA
gpu = cuda.available
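
# A minimal sketch (added, not part of the original snippet): with a Chainer model
# such as the LDA2Vec instance built later in the full script, the flag above is
# typically used as
#   if gpu:
#       model.to_gpu()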

logging.basicConfig()

# Fetch data
texts = fetch_20newsgroups(subset="train").data
# Convert to unicode (spaCy only works with unicode)
texts = [unicode(d) for d in texts]

# Preprocess data
max_length = 1000  # Limit of 1k words per document
tokens, vocab = preprocess.tokenize(texts, max_length, tag=False, parse=False, entity=False)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=5)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
clean = corpus.subsample_frequent(pruned)
# Now flatten a 2D array of document per row and word position
# per column to a 1D array of words. This will also remove skips
# and OoV words
Example #6
def text_prep(text_doc, count):
    text_doc = [digit_removal(d) for d in text_doc]
    text_doc = [stop_word_removal(d) for d in text_doc]
    maxlength_doc = 80000
    text_doc = [unicode(clean(d)) for d in text_doc]
    print(text_doc)
    tokens, vocab = preprocess.tokenize(text_doc,
                                        maxlength_doc,
                                        merge=False,
                                        n_threads=4)
    print(tokens, vocab)
    corpus = Corpus()
    corpus.update_word_count(tokens)
    corpus.finalize()
    compact = corpus.to_compact(tokens)
    pruned = corpus.filter_count(compact, min_count=50)
    bow = corpus.compact_to_bow(pruned)
    clean_data = corpus.subsample_frequent(pruned)
    doc_ids = numpy.arange(pruned.shape[0])
    flattened, (doc_ids, ) = corpus.compact_to_flat(pruned, doc_ids)
    assert flattened.min() >= 0
    pickle.dump(
        vocab, open(os.path.join(result_folder,
                                 str(count) + 'vocab.pkl'), 'w'))
    pickle.dump(
        corpus,
        open(os.path.join(result_folder,
                          str(count) + 'corpus.pkl'), 'w'))
    numpy.save("flattened", flattened)
    numpy.save("doc_ids", doc_ids)
    numpy.save("pruned", pruned)
    numpy.save("bow", bow)
Example #7
		if len(readToken[i]) < max_length:
			for k in range(len(readToken[i]), max_length):
				array[i][k] = -2
	tokens = array

	"""
	    arr : 2D array of ints
	        Has shape (len(texts), max_length). Each value represents
	        the word index.
	    vocab : dict
	        Keys are the word index, and values are the string. The pad index gets
	        mapped to None
	"""

	# Make a ranked list of rare vs frequent words
	corpus = Corpus()
	corpus.update_word_count(tokens)
	corpus.finalize()

	# The tokenization uses spaCy indices, and so may have gaps
	# between indices for words that aren't present in our dataset.
	# This builds a new compact index
	compact = corpus.to_compact(tokens)
	# Remove extremely rare words
	pruned = corpus.filter_count(compact, min_count=10)
	# Words tend to have power law frequency, so selectively
	# downsample the most prevalent words
	clean = corpus.subsample_frequent(pruned)
	print "n_words", np.unique(clean).max()

	story_id = np.array(list(range(0, lineno)), int)
Example #8
class Lda2VecFeaturizer:
    def __init__(self,
                 clambda=200,
                 n_topics=10,
                 batchsize=4096,
                 power=0.75,
                 words_pretrained=True,
                 temperature=1,
                 max_length=1000,
                 min_count=0,
                 word2vec_path=None):
        
        # 'Strength' of the Dirichlet prior; 200.0 seems to work well
        self.clambda = clambda
        # Number of topics to fit
        self.n_topics = n_topics #int(os.getenv('n_topics', 10))
        self.batchsize = batchsize
        # Power for neg sampling
        self.power = power #float(os.getenv('power', 0.75))
        # Initialize with pretrained word vectors
        self.words_pretrained = words_pretrained #bool(int(os.getenv('pretrained', True)))
        self.temp = temperature
        self.max_length = max_length
        self.min_count = min_count
        self.word2vec_model = KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

    def preprocess(self, docs=None):
        """ Uses spaCy to quickly tokenize text and return an array
    of indices.
    This method stores a global NLP directory in memory, and takes
    up to a minute to run for the time. Later calls will have the
    tokenizer in memory."""

        assert isinstance(docs, list), "expected a list of documents"
        assert all(isinstance(doc, unicode) for doc in docs), "expected unicode documents, got str"
        
        self.corpus = Corpus()
        
        tokens, self.vocab = preprocess.tokenize(docs, self.max_length, merge=False, n_threads=4)
        
        # Make a ranked list of rare vs frequent words
        self.corpus.update_word_count(tokens)
        self.corpus.finalize()
        # The tokenization uses spaCy indices, and so may have gaps
        # between indices for words that aren't present in our dataset.
        # This builds a new compact index
        compact = self.corpus.to_compact(tokens)
        # Remove extremely rare words
        pruned = self.corpus.filter_count(compact, min_count=self.min_count)
        # Convert the compactified arrays into bag of words arrays
        bow = self.corpus.compact_to_bow(pruned)
        # Words tend to have power law frequency, so selectively
        # downsample the most prevalent words
        clean = self.corpus.subsample_frequent(pruned)
        # Now flatten a 2D array of document per row and word position
        # per column to a 1D array of words. This will also remove skips
        # and OoV words
        self.doc_ids = np.arange(pruned.shape[0])
        self.flattened, (self.doc_ids,) = self.corpus.compact_to_flat(pruned, self.doc_ids)

        self.vectors, s, f = self.corpus.compact_word_vectors(self.vocab, model=self.word2vec_model)
        # vectors = np.delete(vectors,77743,0)
        # Model Parameters
        # Number of documents
        self.n_docs = len(docs) #doc_ids.max() + 1
        # Number of unique words in the vocabulary
        self.n_vocab = self.flattened.max() + 1

        doc_idx, lengths = np.unique(self.doc_ids, return_counts=True)
        self.doc_lengths = np.zeros(self.doc_ids.max() + 1, dtype='int32')
        self.doc_lengths[doc_idx] = lengths
        # Count all token frequencies
        tok_idx, freq = np.unique(self.flattened, return_counts=True)
        self.term_frequency = np.zeros(self.n_vocab, dtype='int32')
        self.term_frequency[tok_idx] = freq

        self.fraction = self.batchsize * 1.0 / self.flattened.shape[0]
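        # (Added note: `fraction` is the share of all token positions covered by one
        # minibatch; train() and infer() scale the Dirichlet prior term by it so the
        # prior's weight is consistent with minibatch updates.)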

        # Get the string representation for every compact key
        self.words = self.corpus.word_list(self.vocab)[:self.n_vocab]


        
    def train(self, docs=None, epochs=200, update_words=False, update_topics=True):
        """ Takes the training documents as a list of strings, preprocesses
        them, and returns a `data` dictionary containing the topic
        distributions, vocabulary, document lengths, and topic-word
        distributions, together with a dict of per-epoch training losses."""

        texts = docs
        docs = []
        for text in texts:
            docs.append(unicode(" ".join(word for word in text.split() if word in self.word2vec_model.vocab)))

        logging.info("preprocessing...")
        self.preprocess(docs)
        logging.info('preprocessed!')
        
        self.train_model = LDA2Vec(n_documents=self.n_docs,
                                   n_document_topics=self.n_topics,
                                   n_units=300,
                                   n_vocab=self.n_vocab,
                                   counts=self.term_frequency,
                                   n_samples=15,
                                   power=self.power,
                                   temperature=self.temp)
        
        
        
        if self.words_pretrained:
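            # (Added note: this seeds the model's word embeddings with the pretrained
            # word2vec vectors gathered in preprocess(), truncated to the n_vocab
            # words actually present in the flattened corpus.)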
            self.train_model.sampler.W.data = self.vectors[:self.n_vocab, :]



        optimizer = O.Adam()
        optimizer.setup(self.train_model)
        clip = chainer.optimizer.GradientClipping(5.0)
        optimizer.add_hook(clip)
        
        
        
        j = 0
        msgs = defaultdict(list)
        
        for epoch in range(epochs):
            print "epoch : ",epoch
            data = prepare_topics(cuda.to_cpu(self.train_model.mixture.weights.W.data).copy(),
                                  cuda.to_cpu(self.train_model.mixture.factors.W.data).copy(),
                                  cuda.to_cpu(self.train_model.sampler.W.data).copy(),
                                  self.words)
            top_words = print_top_words_per_topic(data)
            if j % 100 == 0 and j > 100:
                coherence = topic_coherence(top_words)
                # use a separate loop variable so the minibatch counter `j` is not clobbered
                for t in range(self.n_topics):
                    print t, coherence[(t, 'cv')]
                kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
                #progress[str(epoch)] = pickle.dumps(kw)
            data['doc_lengths'] = self.doc_lengths
            data['term_frequency'] = self.term_frequency
            #np.savez('topics.pyldavis', **data)
            for d, f in utils.chunks(self.batchsize, self.doc_ids, self.flattened):
                t0 = time.time()
                optimizer.zero_grads()
                l = self.train_model.fit_partial(d.copy(), f.copy(), update_words=update_words, update_topics=update_topics)
                prior = self.train_model.prior()
                loss = prior * self.fraction
                loss.backward()
                optimizer.update()
                msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
                       "P:{prior:1.3e} R:{rate:1.3e}")
                prior.to_cpu()
                loss.to_cpu()
                t1 = time.time()
                dt = t1 - t0
                rate = self.batchsize / dt
                
                
                msgs["E"].append(epoch)
                msgs["L"].append(float(l))

                j += 1
            logs = dict(loss=float(l), epoch=epoch, j=j, prior=float(prior.data), rate=rate)
            print msg.format(**logs)
            print "\n ================================= \n"
            #serializers.save_hdf5("lda2vec.hdf5", self.model)
            msgs["loss_per_epoch"].append(float(l))
        return data, msgs

    
    def initialize_infer(self,
                         clambda=200,
                         batchsize=4096,
                         power=0.75,
                         words_pretrained=True,
                         temperature=1,
                         max_length=1000,
                         min_count=0):
        """ Initializes parameters for testing, if needed.
        Usually not called."""
        
        # 'Strength' of the Dirichlet prior; 200.0 seems to work well
        self.clambda = clambda
        # Minibatch size
        self.batchsize = batchsize
        # Power for neg sampling
        self.power = power #float(os.getenv('power', 0.75))
        # Initialize with pretrained word vectors
        self.words_pretrained = words_pretrained #bool(int(os.getenv('pretrained', True)))
        self.temp = temperature
        self.max_length = max_length
        self.min_count = min_count

        logging.info('Test parameters initialized!')

    def infer(self, docs=None, epochs=200, update_words=False, update_topics=False, topic_vectors=None):
        """ Infers the features of new documents by running the lda2vec
        algorithm again, updating only the per-document topic distributions;
        the word vectors and topic vectors stay fixed by default."""

        texts = docs
        docs = []
        for text in texts:
            docs.append(unicode(" ".join(word for word in text.split() if word in self.word2vec_model.vocab)))

        logging.info("preprocessing")
        
        self.preprocess(docs)
        
        logging.info('preprocessed!')
        
        self.infer_model = LDA2Vec(n_documents=self.n_docs,
                                   n_document_topics=self.n_topics,
                                   n_units=300,
                                   n_vocab=self.n_vocab,
                                   counts=self.term_frequency,
                                   n_samples=15,
                                   power=self.power,
                                   temperature=self.temp)
        
        
        if self.words_pretrained:
            self.infer_model.sampler.W.data = self.vectors[:self.n_vocab, :]

        self.infer_model.mixture.factors.W.data = self.train_model.mixture.factors.W.data
        if topic_vectors is not None:
            assert topic_vectors.shape == self.infer_model.mixture.factors.W.data.shape, \
                "topic_vectors shape doesn't match the trained topic factors"
            self.infer_model.mixture.factors.W.data = topic_vectors


        optimizer = O.Adam()
        optimizer.setup(self.infer_model)
        clip = chainer.optimizer.GradientClipping(5.0)
        optimizer.add_hook(clip)
        
        
        
        j = 0
        msgs = defaultdict(list)
        for epoch in range(epochs):
            print "epoch : ",epoch
            data = prepare_topics(cuda.to_cpu(self.infer_model.mixture.weights.W.data).copy(),
                                  cuda.to_cpu(self.infer_model.mixture.factors.W.data).copy(),
                                  cuda.to_cpu(self.infer_model.sampler.W.data).copy(),
                                  self.words)
            top_words = print_top_words_per_topic(data)
            if j % 100 == 0 and j > 100:
                coherence = topic_coherence(top_words)
                # use a separate loop variable so the minibatch counter `j` is not clobbered
                for t in range(self.n_topics):
                    print t, coherence[(t, 'cv')]
                kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)
                #progress[str(epoch)] = pickle.dumps(kw)
            data['doc_lengths'] = self.doc_lengths
            data['term_frequency'] = self.term_frequency
            #np.savez('topics.pyldavis', **data)
            for d, f in utils.chunks(self.batchsize, self.doc_ids, self.flattened):
                t0 = time.time()
                optimizer.zero_grads()
                l = self.infer_model.fit_partial(d.copy(), f.copy(), update_words=update_words, update_topics=update_topics)
                prior = self.infer_model.prior()
                loss = prior * self.fraction
                loss.backward()
                optimizer.update()
                msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
                       "P:{prior:1.3e} R:{rate:1.3e}")
                prior.to_cpu()
                loss.to_cpu()
                t1 = time.time()
                dt = t1 - t0
                rate = self.batchsize / dt
                
                

                msgs["E"].append(epoch)
                msgs["L"].append(float(l))

                
                j += 1
            logs = dict(loss=float(l), epoch=epoch, j=j, prior=float(prior.data), rate=rate)
            print msg.format(**logs)
            print "\n ================================= \n"
            #serializers.save_hdf5("lda2vec.hdf5", self.model)
            msgs["loss_per_epoch"].append(float(l))
        return data, msgs
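
A hedged end-to-end sketch of how this class appears intended to be used (added; the word2vec path and the document lists below are placeholders, not taken from the original):

featurizer = Lda2VecFeaturizer(n_topics=10,
                               word2vec_path='GoogleNews-vectors-negative300.bin')
data, train_logs = featurizer.train(train_docs, epochs=200)
new_data, infer_logs = featurizer.infer(new_docs, epochs=50)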
Example #9
def process_data(tokens, vocab, model):
    """
    preprocessing of the data by counting word occurrences and filtering according to these.
    The most frequent words are subsampled, and cleans the vocabulary words according to the
    word2vec models vocabulary

    :param tokens: spacy tokens
    :param vocab: spacy vocabulary
    :param model: word2vec model name
    :return:
    """
    corpus = Corpus()
    # Make a ranked list of rare vs frequent words
    corpus.update_word_count(tokens)
    corpus.finalize()
    # The tokenization uses spaCy indices, and so may have gaps
    # between indices for words that aren't present in our dataset.
    # This builds a new compact index
    compact = corpus.to_compact(tokens)
    # Remove extremely rare words
    pruned = corpus.filter_count(compact, min_count=15)
    # Convert the compactified arrays into bag of words arrays
    bow = corpus.compact_to_bow(pruned)
    # Words tend to have power law frequency, so selectively
    # downsample the most prevalent words
    clean = corpus.subsample_frequent(pruned)
    # Now flatten a 2D array of document per row and word position
    # per column to a 1D array of words. This will also remove skips
    # and OoV words
    doc_ids = np.arange(pruned.shape[0])
    flattened, (doc_ids, ) = corpus.compact_to_flat(pruned, doc_ids)
    assert flattened.min() >= 0
    # Fill in the pretrained word vectors
    #n_dim = 300
    fn_wordvc = model
    print("starts to compact word vectors")
    vectors, s, f = corpus.compact_word_vectors(vocab, filename=fn_wordvc)
    print("done with compact word vectors")
    # Save all of the preprocessed files
    print("now saving files")
    pickle.dump(vocab, open('vocab_' + id + '.pkl', 'w'))
    pickle.dump(corpus, open('corpus_' + id + '.pkl', 'w'))
    np.save('flattened_' + id, flattened)
    np.save('doc_ids_' + id, doc_ids)
    np.save('pruned_' + id, pruned)
    # np.save('bow_' + id, bow)  # does not seem to be necessary for lda2vec_run.py
    np.save('vectors_' + id, vectors)
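
A possible invocation (added for illustration; `id` is a module-level string defined elsewhere in the original file, and the texts and word2vec filename below are placeholders):

tokens, vocab = preprocess.tokenize(texts, 10000, merge=False, n_threads=4)
process_data(tokens, vocab, 'GoogleNews-vectors-negative300.bin')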
Example #10
def clean(line):
    return ' '.join(w for w in line.split() if not any(t in w for t in bad))

# Preprocess data
max_length = 10000   # Limit of 10k words per document
# Convert to unicode (spaCy only works with unicode)
if mode == 'chinese':
    texts = [clean_chinese(d) for d in texts]
else:
    texts = [unicode(clean(d)) for d in texts]


data, vocab = preprocess.tokenize(texts, max_length, merge=False,
                                    n_threads=4)
n_words = len(vocab)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(data)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(data)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=10)
# Convert the compactified arrays into bag of words arrays
#bow = corpus.compact_to_bow(pruned)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
#clean = corpus.subsample_frequent(pruned)
# Now flatten a 2D array of document per row and word position
Example #11
#bad = set(["ax>", '`@("', '---', '===', '^^^'])

bad = stop

def clean(line):
    return ' '.join(w for w in line.split() if not any(t in w for t in bad))

# Preprocess data
max_length = 1000   # Limit of 1k words per document
# Convert to unicode (spaCy only works with unicode)

texts = [unicode(clean(str(d)), errors='ignore') for d in texts]  # shivang

tokens, vocab = preprocess.tokenize(texts, max_length, merge=False,
                                    n_threads=4)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=5)
# Convert the compactified arrays into bag of words arrays
bow = corpus.compact_to_bow(pruned)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
clean = corpus.subsample_frequent(pruned)
# Now flatten a 2D array of document per row and word position
Example #12
remove = ('headers', 'footers', 'quotes')
texts = fetch_20newsgroups(subset='train', remove=remove).data
# Remove tokens with these substrings
bad = set(["ax>", '`@("', '---', '===', '^^^'])


def clean(line):
    return ' '.join(w for w in line.split() if not any(t in w for t in bad))

# Preprocess data
max_length = 10000   # Limit of 10k words per document
# Convert to unicode (spaCy only works with unicode)
texts = [unicode(clean(d)) for d in texts]
tokens, vocab = preprocess.tokenize(texts, max_length, merge=False,
                                    n_threads=4)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=30)
# Convert the compactified arrays into bag of words arrays
bow = corpus.compact_to_bow(pruned)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
clean = corpus.subsample_frequent(pruned)
# Now flatten a 2D array of document per row and word position
Example #13
import numpy as np
import os.path
import logging

logging.basicConfig()

# Fetch data
texts = fetch_20newsgroups(subset='train').data
# Convert to unicode (spaCy only works with unicode)
texts = [unicode(d) for d in texts]

# Preprocess data
max_length = 10000   # Limit of 10k words per document
tokens, vocab = preprocess.tokenize(texts, max_length, tag=False,
                                    parse=False, entity=False)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=5)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
clean = corpus.subsample_frequent(pruned)
# Now flatten a 2D array of document per row and word position
# per column to a 1D array of words. This will also remove skips
# and OoV words
Example #14
for col, dtype in zip(features.columns, features.dtypes):
    if dtype == np.dtype('int64'):
        features[col] = features[col].astype('int32')

# Tokenize the texts
# If this fails it's likely spacy. Install a recent spacy version.
# Only the most recent versions have tokenization of noun phrases
# I'm using SHA dfd1a1d3a24b4ef5904975268c1bbb13ae1a32ff
# Also try running python -m spacy.en.download all --force
texts = features.pop('comment_text').values
tokens, vocab = preprocess.tokenize(texts, max_length, n_threads=4,
                                    merge=True)
del texts

# Make a ranked list of rare vs frequent words
corpus = Corpus()
corpus.update_word_count(tokens)
corpus.finalize()

# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=10)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
clean = corpus.subsample_frequent(pruned)
print "n_words", np.unique(clean).max()

# Extract numpy arrays over the fields we want covered by topics
Example #15
        sm_corpus.append(record['Sentences'])
    tokens, vocab = preprocess.tokenize(sm_corpus,
                                        max_length,
                                        n_threads=4,
                                        merge=False)
    del sm_corpus
    np.save("preprocessed/tokens.npy", tokens)
    np.save("preprocessed/vocab.npy", vocab)


if __name__ == '__main__':
    # preprocessing_save()
    tokens = np.load("preprocessed/tokens.npy.npy")
    vocab = np.load("preprocessed/vocab.npy.npy")
    # Make a ranked list of rare vs frequent words
    corpus = Corpus()
    corpus.update_word_count(tokens)
    corpus.finalize()
    # The tokenization uses spaCy indices, and so may have gaps
    # between indices for words that aren't present in our dataset.
    # This builds a new compact index
    compact = corpus.to_compact(tokens)
    # Remove extremely rare words
    pruned = corpus.filter_count(compact, min_count=10)
    # Words tend to have power law frequency, so selectively
    # downsample the most prevalent words
    clean = corpus.subsample_frequent(pruned)
    print "n_words", np.unique(clean).max()

    # # Extract numpy arrays over the fields we want covered by topics
    # # Convert to categorical variables