def preprocess_spacy_tokenize(data):
    import lda2vec.preprocess as spacy_preprocess
    clean_sentences = sentences_without_stopwords(data)
    tokens, vocab = spacy_preprocess.tokenize(clean_sentences, max_length=10000,
                                              merge=False, n_threads=4)
    return tokens, vocab
def text_prep(text_doc, count):
    text_doc = [digit_removal(d) for d in text_doc]
    text_doc = [stop_word_removal(d) for d in text_doc]
    maxlength_doc = 80000
    text_doc = [unicode(clean(d)) for d in text_doc]
    print(text_doc)
    tokens, vocab = preprocess.tokenize(text_doc, maxlength_doc, merge=False, n_threads=4)
    print(tokens, vocab)
    # Make a ranked list of rare vs frequent words
    corpus = Corpus()
    corpus.update_word_count(tokens)
    corpus.finalize()
    # Build a compact index from the spaCy token ids
    compact = corpus.to_compact(tokens)
    # Remove extremely rare words
    pruned = corpus.filter_count(compact, min_count=50)
    # Convert the compactified arrays into bag-of-words arrays
    bow = corpus.compact_to_bow(pruned)
    # Downsample the most prevalent words
    clean_data = corpus.subsample_frequent(pruned)
    # Flatten the 2D document/position array into a 1D array of words
    doc_ids = numpy.arange(pruned.shape[0])
    flattened, (doc_ids,) = corpus.compact_to_flat(pruned, doc_ids)
    assert flattened.min() >= 0
    # Save the vocabulary, corpus, and arrays for the training step
    pickle.dump(vocab, open(os.path.join(result_folder, str(count) + 'vocab.pkl'), 'w'))
    pickle.dump(corpus, open(os.path.join(result_folder, str(count) + 'corpus.pkl'), 'w'))
    numpy.save("flattened", flattened)
    numpy.save("doc_ids", doc_ids)
    numpy.save("pruned", pruned)
    numpy.save("bow", bow)
def main():
    docs = get_docs()
    texts = make_texts(docs, single=False)
    questions = get_questions()
    texts.extend(questions)
    texts = preprocess_text(texts)
    texts = [t for t in texts if t]
    tokens, vocab = preprocess.tokenize(texts, 7500, tag=False, parse=False, entity=False)
    log.info("Got tokens and vocabulary. Vocab size: %d" % len(vocab))
    corpus, flat_corpus, doc_ids, clean_set = make_corpus(tokens=tokens, min_count=50)
    log.info("Got corpus")

    # Model Parameters
    # Number of documents
    n_docs = len(texts)
    log.info("number of texts: %d" % n_docs)
    # Number of unique words in the vocabulary
    n_words = flat_corpus.max() + 1
    # Number of dimensions in a single word vector
    n_hidden = 128
    # Number of topics to fit
    n_topics = 20
    # Get the count for each key
    counts = corpus.keys_counts[:n_words]
    # Get the string representation for every compact key
    words = corpus.word_list(vocab)[:n_words]
    log.info("Words: \n %s" % words)

    # Fit the model
    log.info("fitting the model")
    model = LDA2Vec(n_words, n_hidden, counts, dropout_ratio=0.2)
    model.add_categorical_feature(n_docs, n_topics, name='document_id')
    model.finalize()
    if os.path.exists('model.hdf5'):
        serializers.load_hdf5('model.hdf5', model)
    for _ in range(200):
        log.info("attempt #%d" % _)
        model.top_words_per_topic('document_id', words)
        log.info("TOP_WORDS_PER_TOPIC!\n => ")
        log.info(model.top_words_per_topic('document_id', words))
        log.info('========')
        model.fit(flat_corpus, categorical_features=[doc_ids], fraction=1e-3, epochs=1)
        model.to_cpu()
        serializers.save_hdf5('model.hdf5', model)
    model.top_words_per_topic('document_id', words)
def preprocess(self, docs=None):
    """Uses spaCy to quickly tokenize text and return an array of indices.

    This method stores a global NLP directory in memory and takes up to a
    minute to run the first time. Later calls will have the tokenizer in
    memory already."""
    assert isinstance(docs, list), "input list of documents"
    assert all(isinstance(doc, unicode) for doc in docs), "expected unicode, got string"
    self.corpus = Corpus()
    tokens, self.vocab = preprocess.tokenize(docs, self.max_length, merge=False,
                                             n_threads=4)
    # Make a ranked list of rare vs frequent words
    self.corpus.update_word_count(tokens)
    self.corpus.finalize()
    # The tokenization uses spaCy indices, and so may have gaps
    # between indices for words that aren't present in our dataset.
    # This builds a new compact index
    compact = self.corpus.to_compact(tokens)
    # Remove extremely rare words
    pruned = self.corpus.filter_count(compact, min_count=0)
    # Convert the compactified arrays into bag of words arrays
    bow = self.corpus.compact_to_bow(pruned)
    # Words tend to have power law frequency, so selectively
    # downsample the most prevalent words
    clean = self.corpus.subsample_frequent(pruned)
    # Now flatten a 2D array of document per row and word position
    # per column to a 1D array of words. This will also remove skips
    # and OoV words
    self.doc_ids = np.arange(pruned.shape[0])
    self.flattened, (self.doc_ids,) = self.corpus.compact_to_flat(pruned, self.doc_ids)
    self.vectors, s, f = self.corpus.compact_word_vectors(self.vocab,
                                                          model=self.word2vec_model)
    # vectors = np.delete(vectors, 77743, 0)

    # Model Parameters
    # Number of documents
    self.n_docs = len(docs)  # doc_ids.max() + 1
    # Number of unique words in the vocabulary
    self.n_vocab = self.flattened.max() + 1
    doc_idx, lengths = np.unique(self.doc_ids, return_counts=True)
    self.doc_lengths = np.zeros(self.doc_ids.max() + 1, dtype='int32')
    self.doc_lengths[doc_idx] = lengths
    # Count all token frequencies
    tok_idx, freq = np.unique(self.flattened, return_counts=True)
    self.term_frequency = np.zeros(self.n_vocab, dtype='int32')
    self.term_frequency[tok_idx] = freq
    self.fraction = self.batchsize * 1.0 / self.flattened.shape[0]
    # Get the string representation for every compact key
    self.words = self.corpus.word_list(self.vocab)[:self.n_vocab]
def test_tokenize():
    texts = [u'Do you recall, not long ago']
    texts += [u'We would walk on the sidewalk?']
    arr, vocab = preprocess.tokenize(texts, 10)
    # Adjacent tokens should map to different spaCy ids
    assert arr[0, 0] != arr[0, 1]
    # One row per document, one column per token position
    assert arr.shape[0] == 2
    assert arr.shape[1] == 10
    # Positions past the end of a document are filled with the skip marker
    assert arr[0, -1] == -2
    assert arr.dtype == np.dtype('int32')
    # The first token id should map back to the lowercased first word
    first_word = texts[0].split(' ')[0].lower()
    first_lowr = preprocess.nlp.vocab[arr[0, 0]].lower_
    assert first_word == first_lowr
def get_tokens():
    """
    :return: (tokens, vocab) from spaCy tokenization of the combined texts and questions
    """
    docs = get_docs()
    texts = make_texts(docs, single=False)
    questions = get_questions()
    texts.extend(questions)
    texts = preprocess_text(texts)
    texts = [t for t in texts if t]
    tokens, vocab = preprocess.tokenize(texts, 7500, tag=False, parse=False, entity=False)
    return tokens, vocab
def preprocessing_save():
    json_filename = 'data/NoDAPLsmall.json'
    json_file = open(json_filename)
    json_str = json_file.read()
    json_data = json.loads(json_str)
    sm_corpus = list()
    for record in json_data:
        sm_corpus.append(record['Sentences'])
    tokens, vocab = preprocess.tokenize(sm_corpus, max_length, n_threads=4, merge=False)
    del sm_corpus
    np.save("preprocessed/tokens.npy", tokens)
    np.save("preprocessed/vocab.npy", vocab)
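To reuse the arrays saved by preprocessing_save(), a minimal reload sketch (not part of the original snippet; it assumes the same preprocessed/ paths, and that a recent NumPy is in use, where pickled objects such as the vocab dict must be loaded with allow_pickle=True):

import numpy as np

# tokens is a plain integer array and loads directly
tokens = np.load("preprocessed/tokens.npy")
# vocab was a dict, so np.save pickled it; unwrap the 0-d object array with .item()
vocab = np.load("preprocessed/vocab.npy", allow_pickle=True).item()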
    print x
    return x


sometext = [uniuncode(x) for x in alltext[:100]]
# max words grabbed per document
max_words = 10000
start = datetime.datetime.now()
# convert text to unicode (if not already)
# in my case text is already in unicode
# tokenize uses spaCy under the hood
tokens, vocab = preprocess.tokenize(sometext, max_words, merge=False, n_threads=4)
print '1. tokens made'
# build a generic Corpus based on a default dictionary (see documentation)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()
print '2. corpus updated'
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
features = []
# Convert to unicode (spaCy only works with unicode)
features = pd.read_csv(fn, encoding='utf8', nrows=nrows)
# Convert all integer arrays to int32
for col, dtype in zip(features.columns, features.dtypes):
    if dtype is np.dtype('int64'):
        features[col] = features[col].astype('int32')

# Tokenize the texts
# If this fails it's likely spacy. Install a recent spacy version.
# Only the most recent versions have tokenization of noun phrases
# I'm using SHA dfd1a1d3a24b4ef5904975268c1bbb13ae1a32ff
# Also try running python -m spacy.en.download all --force
texts = features.pop('comment_text').values
tokens, vocab = preprocess.tokenize(texts, max_length, n_threads=4, merge=True)
del texts

# Make a ranked list of rare vs frequent words
corpus = Corpus()
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=10)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
# Preprocess data
max_length = 10000   # Limit of 10k words per document
# Convert to unicode (spaCy only works with unicode)
# texts = [unicode(clean(d)) for d in texts]
texts = [clean(d) for d in texts]

# vocab - dictionary where keys are the loose index and values are the word string
# (Pdb) len(vocab) -> 74179
# (Pdb) type(vocab) -> <class 'dict'>
#
# tokens - 2D array, one row per document; columns hold the long spaCy hashes of
# the words in the order they occur in the document (max 10k words per document)
# (Pdb) tokens.shape -> (11314, 10000)
tokens, vocab = preprocess.tokenize(texts, max_length, merge=False, attr=LEMMA,
                                    n_threads=8)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This will build the 2D compact array, one row per document; columns hold the
# compact word ids in the order they occur in the document (max 10k per document)
#
# (Pdb) len(tokens) -> 11314
# (Pdb) compact.shape -> (11314, 10000)
# (Pdb) compact.max() -> 74179
import pickle
import logging
import time

from sklearn.datasets import fetch_20newsgroups
import numpy as np
from lda2vec import preprocess, Corpus

logging.basicConfig()
start = time.time()

# Fetch data
remove = ('headers', 'footers', 'quotes')
texts = fetch_20newsgroups(subset='train', remove=remove).data

# Preprocess data
max_length = 1000   # Limit of 1k words per document
tokens, vocab = preprocess.tokenize(texts, max_length)
print '0. Tokenizing done at %.1fs' % (time.time() - start)
#del texts
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
print '1. corpus.update_word_count done at %.1fs' % (time.time() - start)
corpus.finalize()
print '2. corpus.finalize done at %.1fs' % (time.time() - start)
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
print '3. corpus.to_compact done at %.1fs' % (time.time() - start)
# Remove extremely rare words
    #np.save('bow_' + id, bow)  # Does not seem to be necessary for lda2vec_run.py
    np.save('vectors_' + id, vectors)


id = "honeypot_clean_revised"
model = "honeypot_clean_model_revised"
#tweets_fn = "../lda2vec/tweets_shuffled_no_links.txt"
#with open("sentences_no_stopwords.txt", "rb") as fp:  # Unpickling
#    sentences = pickle.load(fp)
sentences_part_1 = np.load("sentences_no_stopwords_revised_1.npy")
sentences_part_2 = np.load("sentences_no_stopwords_revised_2.npy")
sentences = np.append(sentences_part_1, sentences_part_2)
#sentences = np.concatenate(sentences_part_1, sentences_part_2)
assert sentences[len(sentences_part_1)] == sentences_part_2[0]
sentences = list_all_unicode(sentences)
print(sentences[:3])
print(np.shape(sentences))
print("starting to tokenize")
tokens, vocab = preprocess.tokenize(sentences, max_length=10000, merge=False, n_threads=4)
print("tokenized")
process_data(tokens, vocab, model=model)
print("finished.")
features = []
# Convert to unicode (spaCy only works with unicode)
features = pd.read_csv(fn, encoding='utf8', nrows=nrows)
# Convert all integer arrays to int64
for col, dtype in zip(features.columns, features.dtypes):
    if dtype is np.dtype('int64'):
        features[col] = features[col].astype('int64')

# Tokenize the texts
# If this fails it's likely spacy. Install a recent spacy version.
# Only the most recent versions have tokenization of noun phrases
# I'm using SHA dfd1a1d3a24b4ef5904975268c1bbb13ae1a64ff
# Also try running python -m spacy.en.download all --force
texts = features.pop('comment_text').values
# If this call errors, try setting merge=False
tokens, vocab = preprocess.tokenize(texts, max_length, n_threads=4, merge=True)
del texts

# Make a ranked list of rare vs frequent words
corpus = Corpus()
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=10)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
from sklearn.datasets import fetch_20newsgroups
from chainer import serializers
import numpy as np
import os.path
import logging

from lda2vec import preprocess, Corpus

logging.basicConfig()

# Fetch data
texts = fetch_20newsgroups(subset='train').data
# Convert to unicode (spaCy only works with unicode)
texts = [unicode(d) for d in texts]

# Preprocess data
max_length = 10000   # Limit of 10k words per document
tokens, vocab = preprocess.tokenize(texts, max_length, tag=False, parse=False,
                                    entity=False)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=5)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
clean = corpus.subsample_frequent(pruned)
# Now flatten a 2D array of document per row and word position
# per column to a 1D array of words. This will also remove skips
import logging

from sklearn.datasets import fetch_20newsgroups
from chainer import cuda
from lda2vec import preprocess, Corpus

# Optional: moving the model to the GPU makes it ~10x faster
# Set to False if you're having problems with Chainer and CUDA
gpu = cuda.available

logging.basicConfig()

# Fetch data
texts = fetch_20newsgroups(subset="train").data
# Convert to unicode (spaCy only works with unicode)
texts = [unicode(d) for d in texts]

# Preprocess data
max_length = 1000   # Limit of 1k words per document
tokens, vocab = preprocess.tokenize(texts, max_length, tag=False, parse=False,
                                    entity=False)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=5)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
clean = corpus.subsample_frequent(pruned)
# Now flatten a 2D array of document per row and word position
# per column to a 1D array of words. This will also remove skips
# Fetch data
remove = ('headers', 'footers', 'quotes')
texts = fetch_20newsgroups(subset='train', remove=remove).data

# Remove tokens with these substrings
bad = set(["ax>", '`@("', '---', '===', '^^^'])


def clean(line):
    return ' '.join(w for w in line.split() if not any(t in w for t in bad))


# Preprocess data
max_length = 10000   # Limit of 10k words per document
# Convert to unicode (spaCy only works with unicode)
texts = [unicode(clean(d)) for d in texts]
tokens, vocab = preprocess.tokenize(texts, max_length, merge=False, n_threads=4)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=30)
# Convert the compactified arrays into bag of words arrays
bow = corpus.compact_to_bow(pruned)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
clean = corpus.subsample_frequent(pruned)
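Several of the truncated snippets above stop just before the flattening step. Based on the fuller pipelines earlier in this section (e.g. the preprocess method and text_prep), a minimal sketch of the usual continuation looks like this, with variable names carried over from the snippet above:

# Flatten the 2D document-by-position array into a 1D stream of compact word
# ids plus a matching doc_ids array; skips and OoV words are dropped here.
doc_ids = np.arange(pruned.shape[0])
flattened, (doc_ids,) = corpus.compact_to_flat(pruned, doc_ids)
assert flattened.min() >= 0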