Example #1
def preprocess_spacy_tokenize(data):
    import lda2vec.preprocess as spacy_preprocess

    clean_sentences = sentences_without_stopwords(data)

    tokens, vocab = spacy_preprocess.tokenize(clean_sentences, max_length=10000, merge=False, n_threads=4)
    return tokens, vocab
Example #2
def text_prep(text_doc, count):
    text_doc = [digit_removal(d) for d in text_doc]
    text_doc = [stop_word_removal(d) for d in text_doc]
    maxlength_doc = 80000
    text_doc = [unicode(clean(d)) for d in text_doc]
    print(text_doc)
    tokens, vocab = preprocess.tokenize(text_doc,
                                        maxlength_doc,
                                        merge=False,
                                        n_threads=4)
    print(tokens, vocab)
    corpus = Corpus()
    corpus.update_word_count(tokens)
    corpus.finalize()
    compact = corpus.to_compact(tokens)
    pruned = corpus.filter_count(compact, min_count=50)
    bow = corpus.compact_to_bow(pruned)
    clean_data = corpus.subsample_frequent(pruned)
    doc_ids = numpy.arange(pruned.shape[0])
    flattened, (doc_ids, ) = corpus.compact_to_flat(pruned, doc_ids)
    assert flattened.min() >= 0
    with open(os.path.join(result_folder, str(count) + 'vocab.pkl'), 'wb') as f:
        pickle.dump(vocab, f)
    with open(os.path.join(result_folder, str(count) + 'corpus.pkl'), 'wb') as f:
        pickle.dump(corpus, f)
    numpy.save("flattened", flattened)
    numpy.save("doc_ids", doc_ids)
    numpy.save("pruned", pruned)
    numpy.save("bow", bow)
Example #3
def main():

    docs = get_docs()
    texts = make_texts(docs, single=False)
    questions = get_questions()
    texts.extend(questions)
    texts = preprocess_text(texts)
    texts = [t for t in texts if t]

    tokens, vocab = preprocess.tokenize(texts,
                                        7500,
                                        tag=False,
                                        parse=False,
                                        entity=False)
    log.info("Got tokens and vocabulary. Vocab size: %d" % len(vocab))

    corpus, flat_corpus, doc_ids, clean_set = make_corpus(tokens=tokens,
                                                          min_count=50)
    log.info("Got corpus")

    # Model Parameters
    # Number of documents
    n_docs = len(texts)
    log.info("number of texts: %d" % n_docs)
    # Number of unique words in the vocabulary
    n_words = flat_corpus.max() + 1
    # Number of dimensions in a single word vector
    n_hidden = 128
    # Number of topics to fit
    n_topics = 20
    # Get the count for each key
    counts = corpus.keys_counts[:n_words]
    # Get the string representation for every compact key
    words = corpus.word_list(vocab)[:n_words]
    log.info("Words: \n %s" % words)

    # Fit the model
    log.info("fitting the model")
    model = LDA2Vec(n_words, n_hidden, counts, dropout_ratio=0.2)
    model.add_categorical_feature(n_docs, n_topics, name='document_id')
    model.finalize()
    if os.path.exists('model.hdf5'):
        serializers.load_hdf5('model.hdf5', model)
    for _ in range(200):
        log.info("attempt #%d" % _)
        model.top_words_per_topic('document_id', words)
        log.info("TOP_WORDS_PER_TOPIC!\n => ")
        log.info(model.top_words_per_topic('document_id', words))
        log.info('========')
        model.fit(flat_corpus,
                  categorical_features=[doc_ids],
                  fraction=1e-3,
                  epochs=1)
        model.to_cpu()
    serializers.save_hdf5('model.hdf5', model)
    model.top_words_per_topic('document_id', words)
Example #4
    def preprocess(self, docs=None):
        """ Uses spaCy to quickly tokenize text and return an array
    of indices.
    This method stores a global NLP directory in memory, and takes
    up to a minute to run for the time. Later calls will have the
    tokenizer in memory."""

        assert (isinstance(docs, list)), ("input list of documents")
        assert (all(isinstance(doc, unicode) for doc in docs)),("expected unicode, got string")
        
        self.corpus = Corpus()
        
        tokens, self.vocab = preprocess.tokenize(docs, self.max_length, merge=False,n_threads=4)
        
        # Make a ranked list of rare vs frequent words
        self.corpus.update_word_count(tokens)
        self.corpus.finalize()
        # The tokenization uses spaCy indices, and so may have gaps
        # between indices for words that aren't present in our dataset.
        # This builds a new compact index
        compact = self.corpus.to_compact(tokens)
        # Remove extremely rare words
        pruned = self.corpus.filter_count(compact, min_count=0)
        # Convert the compactified arrays into bag of words arrays
        bow = self.corpus.compact_to_bow(pruned)
        # Words tend to have power law frequency, so selectively
        # downsample the most prevalent words
        clean = self.corpus.subsample_frequent(pruned)
        # Now flatten a 2D array of document per row and word position
        # per column to a 1D array of words. This will also remove skips
        # and OoV words
        self.doc_ids = np.arange(pruned.shape[0])
        self.flattened, (self.doc_ids,) = self.corpus.compact_to_flat(pruned, self.doc_ids)

        self.vectors, s, f = self.corpus.compact_word_vectors(
            self.vocab, model=self.word2vec_model)
        # vectors = np.delete(vectors, 77743, 0)
        # Model Parameters
        # Number of documents
        self.n_docs = len(docs)  # doc_ids.max() + 1
        # Number of unique words in the vocabulary
        self.n_vocab = self.flattened.max() + 1

        doc_idx, lengths = np.unique(self.doc_ids, return_counts=True)
        self.doc_lengths = np.zeros(self.doc_ids.max() + 1, dtype='int32')
        self.doc_lengths[doc_idx] = lengths
        # Count all token frequencies
        tok_idx, freq = np.unique(self.flattened, return_counts=True)
        self.term_frequency = np.zeros(self.n_vocab, dtype='int32')
        self.term_frequency[tok_idx] = freq

        self.fraction = self.batchsize * 1.0 / self.flattened.shape[0]

        # Get the string representation for every compact key
        self.words = self.corpus.word_list(self.vocab)[:self.n_vocab]
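As an illustrative aside (the instance name p below is assumed, not from the original class): because np.unique counts every flattened token exactly once, the statistics computed above are internally consistent and can be sanity-checked cheaply:

# `p` is a hypothetical instance on which preprocess() has already been called.
assert p.term_frequency.sum() == p.flattened.shape[0]
assert p.doc_lengths.sum() == p.flattened.shape[0]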
Example #5
def test_tokenize():
    texts = [u'Do you recall, not long ago']
    texts += [u'We would walk on the sidewalk?']
    arr, vocab = preprocess.tokenize(texts, 10)
    assert arr[0, 0] != arr[0, 1]
    assert arr.shape[0] == 2
    assert arr.shape[1] == 10
    assert arr[0, -1] == -2
    assert arr.dtype == np.dtype('int32')
    first_word = texts[0].split(' ')[0].lower()
    first_lowr = preprocess.nlp.vocab[arr[0, 0]].lower_
    assert first_word == first_lowr
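The test above pins down the contract of preprocess.tokenize: a fixed-width int32 array with one row per text, negative entries marking padding/skips, and token IDs that can be looked up in spaCy's vocabulary (preprocess.nlp.vocab). Under those assumptions, a hypothetical helper (not part of lda2vec) to turn one row back into lowercase words could look like:

def row_to_words(row, nlp=preprocess.nlp):
    # Skip negative entries (padding / skip markers) and map the remaining
    # spaCy lexeme IDs back to their lowercase string form.
    return [nlp.vocab[int(i)].lower_ for i in row if i >= 0]

# e.g. row_to_words(arr[0]) would start with [u'do', u'you', u'recall', ...]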
Example #6
def get_tokens():
    """
    :return:
    """
    docs = get_docs()
    texts = make_texts(docs, single=False)
    questions = get_questions()

    texts.extend(questions)

    texts = preprocess_text(texts)
    texts = [t for t in texts if t]

    tokens, vocab = preprocess.tokenize(texts, 7500, tag=False, parse=False, entity=False)
    return tokens, vocab
Example #7
def preprocessing_save():
    json_filename = 'data/NoDAPLsmall.json'
    with open(json_filename) as json_file:
        json_data = json.load(json_file)
    sm_corpus = [record['Sentences'] for record in json_data]
    tokens, vocab = preprocess.tokenize(sm_corpus,
                                        max_length,
                                        n_threads=4,
                                        merge=False)
    del sm_corpus
    np.save("preprocessed/tokens.npy", tokens)
    np.save("preprocessed/vocab.npy", vocab)
Example #8
def main():

    docs = get_docs()
    texts = make_texts(docs, single=False)
    questions = get_questions()
    texts.extend(questions)
    texts = preprocess_text(texts)
    texts = [t for t in texts if t]

    tokens, vocab = preprocess.tokenize(texts, 7500, tag=False, parse=False, entity=False)
    log.info("Got tokens and vocabulary. Vocab size: %d" % len(vocab))

    corpus, flat_corpus, doc_ids, clean_set = make_corpus(tokens=tokens, min_count=50)
    log.info("Got corpus")

    # Model Parameters
    # Number of documents
    n_docs = len(texts)
    log.info("number of texts: %d" % n_docs)
    # Number of unique words in the vocabulary
    n_words = flat_corpus.max() + 1
    # Number of dimensions in a single word vector
    n_hidden = 128
    # Number of topics to fit
    n_topics = 20
    # Get the count for each key
    counts = corpus.keys_counts[:n_words]
    # Get the string representation for every compact key
    words = corpus.word_list(vocab)[:n_words]
    log.info("Words: \n %s" % words)

    # Fit the model
    log.info("fitting the model")
    model = LDA2Vec(n_words, n_hidden, counts, dropout_ratio=0.2)
    model.add_categorical_feature(n_docs, n_topics, name="document_id")
    model.finalize()
    if os.path.exists("model.hdf5"):
        serializers.load_hdf5("model.hdf5", model)
    for _ in range(200):
        log.info("attempt #%d" % _)
        model.top_words_per_topic("document_id", words)
        log.info("TOP_WORDS_PER_TOPIC!\n => ")
        log.info(model.top_words_per_topic("document_id", words))
        log.info("========")
        model.fit(flat_corpus, categorical_features=[doc_ids], fraction=1e-3, epochs=1)
        model.to_cpu()
    serializers.save_hdf5("model.hdf5", model)
    model.top_words_per_topic("document_id", words)
Example #9
def get_tokens():
    """
    :return:
    """
    docs = get_docs()
    texts = make_texts(docs, single=False)
    questions = get_questions()

    texts.extend(questions)

    texts = preprocess_text(texts)
    texts = [t for t in texts if t]

    tokens, vocab = preprocess.tokenize(texts,
                                        7500,
                                        tag=False,
                                        parse=False,
                                        entity=False)
    return tokens, vocab
Example #10
def uniuncode(x):
    # Coerce a value to unicode; if conversion fails, print and return it as-is
    try:
        return unicode(x)
    except Exception:
        print x
        return x


sometext = [uniuncode(x) for x in alltext[:100]]

# max words grabbed per document
max_words = 10000

start = datetime.datetime.now()

# convert text to unicode (if not already)
# in my case text is already in unicode
# tokenize uses spacy under the hood
tokens, vocab = preprocess.tokenize(sometext,
                                    max_words,
                                    merge=False,
                                    n_threads=4)

print '1. tokens made'
# Corpus builds a generic corpus based on a default dictionary
# (see documentation)
corpus = Corpus()

# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()

print '2. corpus updated'

# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
Example #11
features = []
# Convert to unicode (spaCy only works with unicode)
features = pd.read_csv(fn, encoding='utf8', nrows=nrows)
# Convert all integer arrays to int32
for col, dtype in zip(features.columns, features.dtypes):
    if dtype == np.dtype('int64'):
        features[col] = features[col].astype('int32')

# Tokenize the texts
# If this fails it's likely spacy. Install a recent spacy version.
# Only the most recent versions have tokenization of noun phrases
# I'm using SHA dfd1a1d3a24b4ef5904975268c1bbb13ae1a32ff
# Also try running python -m spacy.en.download all --force
texts = features.pop('comment_text').values
tokens, vocab = preprocess.tokenize(texts, max_length, n_threads=4, merge=True)
del texts

# Make a ranked list of rare vs frequent words
corpus = Corpus()
corpus.update_word_count(tokens)
corpus.finalize()

# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=10)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
Example #12
features = []
# Convert to unicode (spaCy only works with unicode)
features = pd.read_csv(fn, encoding='utf8', nrows=nrows)
# Convert all integer arrays to int32
for col, dtype in zip(features.columns, features.dtypes):
    if dtype == np.dtype('int64'):
        features[col] = features[col].astype('int32')

# Tokenize the texts
# If this fails it's likely spacy. Install a recent spacy version.
# Only the most recent versions have tokenization of noun phrases
# I'm using SHA dfd1a1d3a24b4ef5904975268c1bbb13ae1a32ff
# Also try running python -m spacy.en.download all --force
texts = features.pop('comment_text').values
tokens, vocab = preprocess.tokenize(texts, max_length, n_threads=4,
                                    merge=True)
del texts

# Make a ranked list of rare vs frequent words
corpus = Corpus()
corpus.update_word_count(tokens)
corpus.finalize()

# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=10)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
Example #13
# Preprocess data
max_length = 10000  # Limit of 10k words per document
# Convert to unicode (spaCy only works with unicode)
# texts = [unicode(clean(d)) for d in texts]
texts = [clean(d) for d in texts]

# vocab - dictionary where keys are the loose index and values are the word string.
# (Pdb) len(vocab) -> 74179
# (Pdb) type(vocab) -> <class 'dict'>
#
# tokens - 2D array, one row per document; columns hold the loose hash IDs of the
# words in the order they occur in the document (max 10k words per document)
# (Pdb) tokens.shape -> (11314, 10000)
tokens, vocab = preprocess.tokenize(texts,
                                    max_length,
                                    merge=False,
                                    attr=LEMMA,
                                    n_threads=8)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()

# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a 2D array, one row per document; columns hold the compact word IDs
# in the order they occur in the document (max 10k words per document)
#
# (Pdb) len(tokens) -> 11314
# (Pdb) compact.shape -> (11314, 10000)
# (Pdb) compact.max() -> 74179
Example #14
import logging
import pickle
import time

from sklearn.datasets import fetch_20newsgroups
import numpy as np

from lda2vec import preprocess, Corpus

logging.basicConfig()
start = time.time()

# Fetch data
remove = ('headers', 'footers', 'quotes')
texts = fetch_20newsgroups(subset='train', remove=remove).data
# Preprocess data
max_length = 1000  # Limit of 1k words per document
tokens, vocab = preprocess.tokenize(texts, max_length)
print '0. Tokenizing done at %.1fs' % (time.time() - start)

#del texts
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
print '1. corpus.update_word_count done at %.1fs' % (time.time() - start)
corpus.finalize()
print '2. corpus.finalize done at %.1fs' % (time.time() - start)
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
print '3. corpus.to_compact done at %.1fs' % (time.time() - start)
# Remove extremely rare words
Example #15
    #np.save('bow_'+id, bow)  # Does not seem to be necessary for lda2vec_run.py
    np.save('vectors_' + id, vectors)


id = "honeypot_clean_revised"

model = "honeypot_clean_model_revised"

#tweets_fn = "../lda2vec/tweets_shuffled_no_links.txt"

#with open("sentences_no_stopwords.txt", "rb") as fp:   # Unpickling
#    sentences = pickle.load(fp)

sentences_part_1 = np.load("sentences_no_stopwords_revised_1.npy")
sentences_part_2 = np.load("sentences_no_stopwords_revised_2.npy")
sentences = np.append(sentences_part_1, sentences_part_2)
#sentences = np.concatenate(sentences_part_1, sentences_part_2)
assert sentences[len(sentences_part_1)] == sentences_part_2[0]

sentences = list_all_unicode(sentences)
print(sentences[:3])
print(np.shape(sentences))

print("starts to tokenize")
tokens, vocab = preprocess.tokenize(sentences,
                                    max_length=10000,
                                    merge=False,
                                    n_threads=4)
print("tokenized")
process_data(tokens, vocab, model=model)
print("finished.")
Example #16
features = []
# Convert to unicode (spaCy only works with unicode)
features = pd.read_csv(fn, encoding='utf8', nrows=nrows)
# Convert all integer arrays to int64
for col, dtype in zip(features.columns, features.dtypes):
    if dtype == np.dtype('int64'):
        features[col] = features[col].astype('int64')

# Tokenize the texts
# If this fails it's likely spacy. Install a recent spacy version.
# Only the most recent versions have tokenization of noun phrases
# I'm using SHA dfd1a1d3a24b4ef5904975268c1bbb13ae1a64ff
# Also try running python -m spacy.en.download all --force
texts = features.pop('comment_text').values
tokens, vocab = preprocess.tokenize(texts, max_length, n_threads=4,
                                    merge=True)  # if this line errors, try merge=False
del texts

# Make a ranked list of rare vs frequent words
corpus = Corpus()
corpus.update_word_count(tokens)
corpus.finalize()

# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=10)
# Words tend to have power law frequency, so selectively
Example #17
from sklearn.datasets import fetch_20newsgroups
from chainer import serializers
import numpy as np
import os.path
import logging

logging.basicConfig()

# Fetch data
texts = fetch_20newsgroups(subset='train').data
# Convert to unicode (spaCy only works with unicode)
texts = [unicode(d) for d in texts]

# Preprocess data
max_length = 10000   # Limit of 10k words per document
tokens, vocab = preprocess.tokenize(texts, max_length, tag=False,
                                    parse=False, entity=False)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=5)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
clean = corpus.subsample_frequent(pruned)
# Now flatten a 2D array of document per row and word position
# per column to a 1D array of words. This will also remove skips
Example #18
import logging

# Optional: moving the model to the GPU makes it ~10x faster
# set to False if you're having problems with Chainer and CUDA
gpu = cuda.available

logging.basicConfig()

# Fetch data
texts = fetch_20newsgroups(subset="train").data
# Convert to unicode (spaCy only works with unicode)
texts = [unicode(d) for d in texts]

# Preprocess data
max_length = 1000  # Limit of 1k words per document
tokens, vocab = preprocess.tokenize(texts, max_length, tag=False, parse=False, entity=False)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=5)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
clean = corpus.subsample_frequent(pruned)
# Now flatten a 2D array of document per row and word position
# per column to a 1D array of words. This will also remove skips
Example #19
remove = ('headers', 'footers', 'quotes')
texts = fetch_20newsgroups(subset='train', remove=remove).data
# Remove tokens with these substrings
bad = set(["ax>", '`@("', '---', '===', '^^^'])


def clean(line):
    return ' '.join(w for w in line.split() if not any(t in w for t in bad))


# Preprocess data
max_length = 10000  # Limit of 10k words per document
# Convert to unicode (spaCy only works with unicode)
texts = [unicode(clean(d)) for d in texts]
tokens, vocab = preprocess.tokenize(texts,
                                    max_length,
                                    merge=False,
                                    n_threads=4)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=30)
# Convert the compactified arrays into bag of words arrays
bow = corpus.compact_to_bow(pruned)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
Example #20
# Fetch data
remove = ('headers', 'footers', 'quotes')
texts = fetch_20newsgroups(subset='train', remove=remove).data
# Remove tokens with these substrings
bad = set(["ax>", '`@("', '---', '===', '^^^'])


def clean(line):
    return ' '.join(w for w in line.split() if not any(t in w for t in bad))
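
# Quick illustrative check of clean() on a synthetic input (not from the
# dataset): whitespace-delimited tokens containing any of the `bad`
# substrings are dropped.
assert clean('hello ---x ax>y world') == 'hello world'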

# Preprocess data
max_length = 10000   # Limit of 10k words per document
# Convert to unicode (spaCy only works with unicode)
texts = [unicode(clean(d)) for d in texts]
tokens, vocab = preprocess.tokenize(texts, max_length, merge=False,
                                    n_threads=4)
corpus = Corpus()
# Make a ranked list of rare vs frequent words
corpus.update_word_count(tokens)
corpus.finalize()
# The tokenization uses spaCy indices, and so may have gaps
# between indices for words that aren't present in our dataset.
# This builds a new compact index
compact = corpus.to_compact(tokens)
# Remove extremely rare words
pruned = corpus.filter_count(compact, min_count=30)
# Convert the compactified arrays into bag of words arrays
bow = corpus.compact_to_bow(pruned)
# Words tend to have power law frequency, so selectively
# downsample the most prevalent words
clean = corpus.subsample_frequent(pruned)