import csv

import pyLDAvis
from gensim import corpora
from gensim.models import ldamodel
# prepare() lives in pyLDAvis.gensim in the pyLDAvis versions this code targets.
from pyLDAvis.gensim import prepare

from alife.txtmine.tokenizer import Tokenizer
# load_vocab, load_corpus, and pickle_obj are project-local helpers; the
# module path below is assumed from the alife package layout.
from alife.util.general import load_vocab, load_corpus, pickle_obj


class MyLda(object):
    """
    Wrapper around a gensim LDA model, with utilities for saving, loading,
    and preprocessing text and corpus objects, parsing output, and exporting
    a model summary to disk.
    """

    def __init__(self, n_topics, name=''):
        """ Initializes a model without training. """
        super(MyLda, self).__init__()
        self.K = n_topics
        assert isinstance(name, str)
        self.name = name if name else 'lda_' + str(self.K)
        self.tokenizer = Tokenizer()
        self.pnos = None
        self.has_vocab = False
        self.has_corpus = False
        self.is_trained = False
        self.has_topics = False
        self.has_viz_data = False

    def load_vocab(self, fn):
        """ Loads a gensim vocab .dict object from disk. """
        self.vocab = load_vocab(fn)
        self.has_vocab = True

    def load_corpus(self, fn):
        """ Loads a gensim SvmLightCorpus object from disk. """
        self.corpus = load_corpus(fn)
        self.has_corpus = True

    def _process_texts(self, texts, generator=False):
        """ Builds the vocab and corpus from an iterable of strings. """
        wordlists = [x for x in (self.tokenizer.tokenize(doc) for doc in texts)
                     if x != []]
        self.vocab = corpora.Dictionary(wordlists)
        self.has_vocab = True
        self.corpus = (self.vocab.doc2bow(doc) for doc in wordlists)
        if not generator:
            self.corpus = list(self.corpus)
        self.has_corpus = True

    def parse_topics(self, n=10):
        """
        Parses the model's topics into lists of top words, in decreasing
        order of probability under that topic.
        """
        assert self.is_trained
        # print_topics is assumed to return formatted strings
        # ('0.1*word + 0.05*word + ...'), as in the gensim versions this
        # code targets. List comprehensions replace lazy map() so that
        # slicing and indexing below work under Python 3 as well.
        raw_topics = self._lda_model.print_topics(self._lda_model.num_topics)
        topics = [x.split(' + ') for x in raw_topics]
        top_words = [[x.split('*')[1] for x in topic[:n]] for topic in topics]
        self.topics = top_words
        self.has_topics = True
        return top_words

    def describe_topic(self, index):
        """ Returns the top words for the given topic. """
        assert self.has_topics
        assert 0 <= index < self.K
        return self.topics[index]

    def fit(self, pnos, texts=None, from_loaded=False):
        """ Fits a model from an iterable of strings (full, unparsed docs). """
        self.pnos = pnos
        assert (texts is not None) or from_loaded
        if texts is not None:
            self._process_texts(texts)
        else:
            assert self.has_vocab and self.has_corpus
        self._lda_model = ldamodel.LdaModel(corpus=self.corpus,
                                            id2word=self.vocab,
                                            num_topics=self.K)
        self.is_trained = True
        _ = self.parse_topics()

    def doc_topics(self, docs):
        """
        Gets the vectors of topic strengths for the given docs (strings).
        TODO: does this deal with out-of-vocabulary tokens? NBD.
        """
        assert self.has_vocab
        assert self.is_trained
        tknzd = [self.tokenizer.tokenize(doc) for doc in docs]
        bows = [self.vocab.doc2bow(tkns) for tkns in tknzd]
        return [self._lda_model[bow] for bow in bows]

    def save(self, outdir, just_lda=False):
        """ Saves the pnos, vocab, corpus, and LDA model to disk. """
        if not just_lda:
            pnofn = '/'.join([outdir, 'pnos.p'])
            vocabfn = '/'.join([outdir, 'vocab_' + self.name + '.dict'])
            corpusfn = '/'.join([outdir, 'corpus_' + self.name + '.svmlight'])
            if self.pnos is not None:
                pickle_obj(pnofn, self.pnos)
            self.vocab.save(vocabfn)
            corpora.SvmLightCorpus.serialize(corpusfn, self.corpus)
        ldafn = '/'.join([outdir, self.name + '.lda'])
        self._lda_model.save(ldafn)

    def visualize(self, outfn):
        """
        Produces a pyLDAvis visualization of the model and saves it to disk
        at the given location.
        """
        if self.has_viz_data:
            pyLDAvis.save_html(self.vis_data, outfn)
            return
        assert self.has_vocab and self.has_corpus
        assert self.is_trained
        # This may run out of memory: corpus, vocab, and _lda_model are all big.
        self.vis_data = prepare(self._lda_model, self.corpus, self.vocab)
        self.has_viz_data = True
        pyLDAvis.save_html(self.vis_data, outfn)

    def export(self, outdir, topic_docs=None):
        """
        Produces a "model report": exports parsed topics, doc topics, and a
        visualization. topic_docs, if not None, should be a tuple
        (pnos, texts).
        """
        parsed_topics_fn = outdir + '/parsed_topics_' + self.name + '.csv'
        parsed_topics = self.parse_topics()
        with open(parsed_topics_fn, 'wb') as outfile:
            writer = csv.writer(outfile)
            writer.writerow(['topic index', 'top words'])
            for i, t in enumerate(parsed_topics):
                writer.writerow([i] + t)
        if topic_docs is not None:
            doc_tops_fn = outdir + '/doc_topics_' + self.name + '.csv'
            pnos, texts = topic_docs
            doc_tops = self.doc_topics(texts)
            with open(doc_tops_fn, 'wb') as outfile:
                writer = csv.writer(outfile)
                writer.writerow(['pno', 'top 10 topics'])
                for pno, dts in zip(pnos, doc_tops):
                    writer.writerow([pno] + dts)
        visualize_fn = outdir + '/vis_' + self.name + '.html'
        self.visualize(visualize_fn)
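
# Usage sketch (not part of the class): fits a model on a few toy documents
# and writes the report files. The documents, patent numbers, and output
# directory below are hypothetical placeholders.
def _example_usage(outdir='/tmp/lda_demo'):
    texts = ['silicon wafer etching process',
             'polymerase chain reaction amplification of dna',
             'cellular telephone antenna design']
    pnos = [1000001, 1000002, 1000003]   # hypothetical patent numbers
    model = MyLda(n_topics=2, name='demo')
    model.fit(pnos, texts=texts)
    print(model.describe_topic(0))        # top words for topic 0
    strengths = model.doc_topics(texts)   # per doc: list of (topic_id, weight)
    model.save(outdir)                    # writes pnos.p, vocab, corpus, .lda
    model.export(outdir, topic_docs=(pnos, texts))
    return strengths
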
import sys
from collections import defaultdict
from pprint import pprint

import numpy as np
from pymongo import MongoClient
from gensim import models
from gensim import matutils
from sklearn.externals import joblib

from alife.mockdb import get_mock
from alife.txtmine import stemmer
from alife.util import model_loader
from alife.util.general import cosine_dist, euclidean_dist, save_dict
from alife.txtmine.tokenizer import Tokenizer
from alife.visualize.w2v_vis import embedding_fig

_db = MongoClient().patents
_tokenizer = Tokenizer()
_friendly_patents = [('zeolites', 4061724),
                     ('semiconductors', 4064521),
                     ('nonwoven webs', 4340563),
                     ('rsa', 4405829),
                     ('stents', 4655771),
                     ('pcr', 4683202),
                     ('bubble jet', 4723129),
                     ('cell phone', 5103459),
                     ('microarrays', 5143854),
                     ('browser', 5572643)]
_names, _pnos = zip(*_friendly_patents)


def _dist(v1, v2):
    """ Cosine similarity between two vectors (dot product of unit vectors). """
    return np.dot(matutils.unitvec(v1), matutils.unitvec(v2))


def load_w2v(filename):
    """ Loads a word2vec model stored at the given location. """
    return models.word2vec.Word2Vec.load(filename)
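
# Usage sketch: loads a trained word2vec model and compares the embeddings of
# two of the "friendly" patents above. The model path is a hypothetical
# placeholder, and it is an assumption here that patent numbers (as strings)
# are keys in the model's vocabulary. _dist returns cosine similarity in [-1, 1].
def _example_similarity(model_fn='/path/to/model.w2v'):
    w2v = load_w2v(model_fn)
    v1 = w2v[str(_pnos[0])]   # embedding for the 'zeolites' patent
    v2 = w2v[str(_pnos[1])]   # embedding for the 'semiconductors' patent
    return _dist(v1, v2)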