Example #1
def __init__(self, n_topics, name=''):
    """
    Initializes a model without training.
    """
    super(MyLda, self).__init__()
    self.K = n_topics
    assert isinstance(name, str)
    if name == '':
        self.name = 'lda_' + str(self.K)
    else:
        self.name = name
    self.tokenizer = Tokenizer()
    self.has_vocab = False
    self.has_corpus = False
    self.is_trained = False
    self.has_viz_data = False
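For reference, a quick sketch of what this constructor yields (the values follow directly from the code above; 20 is an arbitrary topic count):

m = MyLda(20)      # no name supplied
m.name             # -> 'lda_20'
m.is_trained       # -> False; the wrapper is untrained until fit() is called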
Example #2
import csv

import pyLDAvis
from gensim import corpora
from gensim.models import ldamodel
from pyLDAvis.gensim import prepare

# Tokenizer, load_vocab, load_corpus, and pickle_obj are project-level helpers
# assumed to be defined elsewhere in the surrounding package.


class MyLda(object):
    """
    Wrapper around gensim LDA model with utilities for
    saving/loading/preprocessing text and corpus objects,
    parsing output, and exporting model summary to disk.
    """
    def __init__(self, n_topics, name=''):
        """
        Initializes a model without training.
        """
        super(MyLda, self).__init__()
        self.K = n_topics
        assert isinstance(name, str)
        if name == '':
            self.name = 'lda_' + str(self.K)
        else:
            self.name = name
        self.tokenizer = Tokenizer()
        self.has_vocab = False
        self.has_corpus = False
        self.is_trained = False
        self.has_viz_data = False

    def load_vocab(self, fn):
        """loads a gensim vocab.dict object from disk."""
        vocab = load_vocab(fn)
        self.vocab = vocab
        self.has_vocab = True

    def load_corpus(self, fn):
        """loads a gensim SvmLightCorpus object from disk."""
        corpus = load_corpus(fn)
        self.corpus = corpus
        self.has_corpus = True

    def _process_texts(self, texts, generator=False):
        """ Gets vocab and corpus from a list of strings. """
        wordlists = [
            x for x in [self.tokenizer.tokenize(doc) for doc in texts]
            if x != []
        ]
        self.vocab = corpora.Dictionary(wordlists)
        self.has_vocab = True
        self.corpus = (self.vocab.doc2bow(doc) for doc in wordlists)
        if not generator:
            self.corpus = list(self.corpus)
        self.has_corpus = True

    def parse_topics(self, n=10):
        """
        Parses the model's topics into lists of top words, in decreasing
        sorted order of probability under that topic. 
        """
        assert (self.is_trained)
        # note: assumes the older gensim API where print_topics returns plain strings
        raw_topics = self._lda_model.print_topics(self._lda_model.num_topics)
        topics = [raw.split(' + ') for raw in raw_topics]
        top_words = [
            [term.split('*')[1] for term in topic[:n]] for topic in topics
        ]
        self.topics = top_words
        self.has_topics = True
        return top_words

    def describe_topic(self, index):
        """ Spits out a description of the the topic. """
        assert (self.has_topics)
        assert (0 <= index < self.K)
        return self.topics[index]

    def fit(self, pnos, texts=None, from_loaded=False):
        """fits a model from an iterable of strings (full, unparsed docs). """
        self.pnos = pnos
        assert ((texts is not None) or from_loaded)
        if texts is not None:
            self._process_texts(texts)
        else:
            assert (self.has_vocab and self.has_corpus)
        self._lda_model = ldamodel.LdaModel(corpus=self.corpus,
                                            id2word=self.vocab,
                                            num_topics=self.K)
        self.is_trained = True
        _ = self.parse_topics()

    def doc_topics(self, docs):
        """
        Get the vectors of topic strengths for the given docs (strings). 
        TODO: does this deal with out-of-vocabulary tokens? NBD.
        """
        assert (self.has_vocab)
        assert (self.is_trained)
        tknzd = [self.tokenizer.tokenize(doc) for doc in docs]
        bows = [self.vocab.doc2bow(tkns) for tkns in tknzd]
        return [self._lda_model[bow] for bow in bows]

    def save(self, outdir, just_lda=False):
        """ save all files"""
        if not just_lda:
            pnofn = '/'.join([outdir, 'pnos.p'])
            vocabfn = '/'.join([outdir, 'vocab_' + self.name + '.dict'])
            corpusfn = '/'.join([outdir, 'corpus_' + self.name + '.svmlight'])
            if self.pnos is not None:
                pickle_obj(pnofn, self.pnos)
            self.vocab.save(vocabfn)
            corpora.SvmLightCorpus.serialize(corpusfn, self.corpus)
        ldafn = '/'.join([outdir, self.name + '.lda'])
        self._lda_model.save(ldafn)

    def visualize(self, outfn):
        """
        Produce a pyLDAvis visualization of a model and save to disk at the given location.
        """
        if self.has_viz_data:
            pyLDAvis.save_html(self.vis_data, outfn)
            return
        assert (self.has_vocab and self.has_corpus)
        assert (self.is_trained)
        # this may crash, likely because corpus, vocab, and _lda_model are all large.
        self.vis_data = prepare(self._lda_model, self.corpus, self.vocab)
        self.has_viz_data = True
        pyLDAvis.save_html(self.vis_data, outfn)

    def export(self, outdir, topic_docs=None):
        """ 
        Produce a "model report". 
        Exports parsed topics, doc topics, and the visualization.
        topic_docs should be a tuple (pnos, texts) if not None.
        """
        parsed_topics_fn = outdir + '/parsed_topics_' + self.name + '.csv'
        parsed_topics = self.parse_topics()
        with open(parsed_topics_fn, 'wb') as outfile:
            writer = csv.writer(outfile)
            writer.writerow(['topic index', 'top words'])
            for i, t in enumerate(parsed_topics):
                writer.writerow([i] + t)
        if topic_docs is not None:
            doc_tops_fn = outdir + '/doc_topics_' + self.name + '.csv'
            pnos, texts = topic_docs
            doc_tops = self.doc_topics(texts)
            with open(doc_tops_fn, 'wb') as outfile:
                writer = csv.writer(outfile)
                writer.writerow(['pno', 'top 10 topics'])
                for pno, dts in zip(pnos, doc_tops):
                    writer.writerow([pno] + dts)
        visualize_fn = outdir + '/vis' + self.name + '.html'
        self.visualize(visualize_fn)
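A minimal usage sketch of the wrapper above, assuming its Python 2-era dependencies (gensim, pyLDAvis) are available, an out/ directory already exists, and the patent numbers and texts below are placeholders:

pnos = [4061724, 4064521]                          # hypothetical doc ids
texts = ["A zeolite composition ...", "A semiconductor device ..."]

model = MyLda(n_topics=20, name='demo')
model.fit(pnos, texts=texts)                       # builds vocab/corpus, trains the LDA model
print(model.describe_topic(0))                     # top words for topic 0
model.save('out')                                  # writes vocab, corpus, pnos, and .lda files
model.export('out', topic_docs=(pnos, texts))      # CSV summaries plus pyLDAvis HTML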
Example #3
import sys
from collections import defaultdict
import numpy as np
from pymongo import MongoClient
from gensim import models
from gensim import matutils
from sklearn.externals import joblib
from alife.mockdb import get_mock
from alife.txtmine import stemmer
from alife.util import model_loader
from alife.util.general import cosine_dist, euclidean_dist, save_dict
from alife.txtmine.tokenizer import Tokenizer
from alife.visualize.w2v_vis import embedding_fig
from pprint import pprint

_db = MongoClient().patents
_tokenizer = Tokenizer()
_friendly_patents = [('zeolites', 4061724), ('semiconductors', 4064521),
                     ('nonwoven webs', 4340563), ('rsa', 4405829),
                     ('stents', 4655771), ('pcr', 4683202),
                     ('bubble jet', 4723129), ('cell phone', 5103459),
                     ('microarrays', 5143854), ('browser', 5572643)]
_names, _pnos = zip(*_friendly_patents)


def _dist(v1, v2):
    # cosine similarity: dot product of the unit-normalized vectors
    return np.dot(matutils.unitvec(v1), matutils.unitvec(v2))


def load_w2v(filename):
    """Loads a word2vec model stored at the given location."""
    return models.word2vec.Word2Vec.load(filename)
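
A small sketch of load_w2v together with _dist, assuming a trained model saved at 'w2v.model' whose vocabulary contains the query words (path and words are placeholders):

w2v = load_w2v('w2v.model')
v1, v2 = w2v['zeolite'], w2v['semiconductor']      # old-style gensim word-vector lookup
print(_dist(v1, v2))                               # cosine similarity of the two word vectors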