def create_online_lda(docs, ids, name, numTopics, passes=10):
    """Train a gensim LDA model on ``docs`` and persist it to disk.

    The documents are first converted with ``docs2corpus(docs, name, True)``
    (a helper defined elsewhere in this file; it must return a
    ``(corpus, dictionary)`` pair). The trained model is saved to
    ``name + '.lda'``.

    Args:
        docs: Documents handed to ``docs2corpus``.
        ids: Identifiers forwarded unchanged to ``lda2topicMap``
            (presumably one per document -- confirm against that helper).
        name: Base name; used for ``docs2corpus``/``lda2topicMap`` and as the
            prefix of the saved model file.
        numTopics: Number of topics requested from ``LdaModel``.
        passes: Number of training passes over the corpus (defaults to the
            previously hard-coded value of 10).

    Returns:
        A 2-tuple ``(topic_map, topics)`` where ``topic_map`` is whatever
        ``lda2topicMap(lda, corpus, ids, name)`` returns and ``topics`` is
        ``lda.show_topics(formatted=False)``.
    """
    corpus, dictionary = docs2corpus(docs, name, True)

    # Single-argument print(...) produces the same output on Python 2 and 3,
    # unlike the print statement, which is a syntax error on Python 3.
    print('>> generating online lda model...')
    lda = gensim.models.ldamodel.LdaModel(
        corpus,
        num_topics=numTopics,
        id2word=dictionary,
        passes=passes,
    )
    print(lda)

    lda.save(name + '.lda')
    return lda2topicMap(lda, corpus, ids, name), lda.show_topics(formatted=False)
def content_model(data, num_topics=20, alpha=1, dict_path="dic.m",
                  corpus_path="corpus.m", model_path="ldaModel.model"):
    """Build a dictionary, corpus and TF-IDF-weighted LDA model from ``data``.

    Three artifacts are written to disk: the dictionary (``dict_path``), the
    bag-of-words corpus in Matrix Market format (``corpus_path``) and the
    trained LDA model (``model_path``). The defaults reproduce the file names
    and hyper-parameters that were previously hard-coded.

    Args:
        data: Iterable of tokenized documents (each an iterable of tokens),
            as accepted by ``corpora.Dictionary``.
        num_topics: Number of LDA topics.
        alpha: Document-topic prior passed straight to ``models.LdaModel``.
        dict_path: Where to save the dictionary.
        corpus_path: Where to serialize the bag-of-words corpus.
        model_path: Where to save the trained LDA model.

    Returns:
        The trained ``models.LdaModel`` (previously nothing was returned, so
        existing callers that ignore the result are unaffected).
    """
    dic = corpora.Dictionary(data)
    dic.save(dict_path)

    corpus = [dic.doc2bow(text) for text in data]
    corpora.MmCorpus.serialize(corpus_path, corpus)

    # The LDA model is trained on TF-IDF weights rather than raw counts.
    # NOTE(review): the TfidfModel itself is not persisted, so applying the
    # saved LDA model to new documents requires rebuilding it -- confirm
    # whether that is intended.
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    lda = models.LdaModel(corpus_tfidf, id2word=dic, num_topics=num_topics,
                          alpha=alpha)
    lda.save(model_path)
    return lda
# integer word id and returns the result as a sparse vector.
corpus = [dictionary.doc2bow(tokens) for tokens in texts]
# corpora.MmCorpus.serialize('raop.mm', corpus)

# Training the corpus using various models.
# ONLY FIT THE MODEL ONCE BECAUSE TOPICS CHANGE AFTER EVERY RUN!
# Load the saved model file to apply the model to new documents.

# Latent Dirichlet Allocation (LDA): fit on the bag-of-words corpus, wrap the
# corpus with the model, and persist the model to disk.
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=10, passes=3)
corpus_lda = lda[corpus]
lda.save('model.lda')

# LOAD THE MODELS BEFORE TRYING TO RUN SIMILARITY QUERIES!
lda = models.LdaModel.load('model.lda')

# Apply the LDA model to every request text and record the id of its
# highest-scoring topic.
docs = request_text_list
lda_topics = []
for doc in docs:
    bow = dictionary.doc2bow(doc.lower().split())
    # Stable descending sort by score: ties keep the model's original order.
    ranked = sorted(lda[bow], key=lambda pair: pair[1], reverse=True)
    lda_topics.append(ranked[0][0])
# Persist the TF-IDF model built earlier in the script.
tfidf.save('model.tfidf')

# Latent Semantic Indexing, LSI (or sometimes LSA) -- trained on TF-IDF weights.
lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
corpus_lsi = lsi[corpus_tfidf]
lsi.save('model.lsi')

# Random Projections, RP -- trained on TF-IDF weights.
rp = models.RpModel(corpus_tfidf, num_topics=10)
corpus_rp = rp[corpus_tfidf]
rp.save('model.rp')

# Latent Dirichlet Allocation, LDA -- trained on the bag-of-words corpus.
lda = models.LdaModel(corpus, id2word=dictionary, num_topics=10, passes=5)
corpus_lda = lda[corpus]
lda.save('model.lda')

# Hierarchical Dirichlet Process, HDP -- trained on the bag-of-words corpus.
hdp = models.HdpModel(corpus, id2word=dictionary)
corpus_hdp = hdp[corpus]
hdp.save('model.hdp')

# LOAD THE MODELS BEFORE TRYING TO RUN SIMILARITY QUERIES!
tfidf = models.TfidfModel.load('model.tfidf')
lsi = models.LsiModel.load('model.lsi')
rp = models.RpModel.load('model.rp')
lda = models.LdaModel.load('model.lda')
hdp = models.HdpModel.load('model.hdp')
def _loop_worker(self):
    """The main loop for performing real time analysis.

    Takes items from the marker stream's analysis queue sequentially, forms
    MNE epochs, and either accumulates the data for real time training or
    uses it to predict the letter that was mind-typed. Runs until
    ``self._kill_signal`` is set.

    In training mode (``self.train`` truthy) every batch of epochs is added
    to ``self.train_data`` / ``self.train_targets``; once
    ``self.train_number`` reaches ``self.train_epochs`` a classifier is
    trained via the project ``lda`` module, saved to ``self.path`` and the
    counter is reset. Otherwise the classifier stored at ``self.path`` is
    loaded and per-event predictions are appended to ``self.predictions``.

    Structure is adapted from rteeg.rteeg.analysis._loop_worker.
    """
    sleep_time = 0.01  # Time to sleep between queries.

    while not self._kill_signal.is_set():
        # Only do work when items exist in the marker analysis queue.
        if not self.m_stream.analyze.empty():
            print('Began analyzing data...')

            # Locate the EEG sample whose timestamp (last column) is closest
            # to the marker timestamp, then extend past it by
            # 1 / (1 / sfreq) samples so the whole trial is captured.
            ts = self.m_stream.remove_analysis()
            tmp = np.array(self.eeg_stream.data)
            end_index = int((np.abs(tmp[:, -1] - ts)).argmin()
                            + 1 / (1 / self.eeg_stream.info['sfreq']))

            # Ensure there is enough EEG data before analyzing; wait if there
            # isn't. The kill signal is honoured here as well, otherwise a
            # stalled EEG stream would make the worker impossible to stop.
            while (len(self.eeg_stream.data) < end_index
                   and not self._kill_signal.is_set()):
                time.sleep(sleep_time)
            if len(self.eeg_stream.data) < end_index:
                # Woken by the kill signal without enough data: stop.
                break

            # Make an MNE epoch from channels 0-3 (EEG),
            # decim = keep every nth sample.
            epochs, identities, targets = self.eeg_stream.make_epochs(
                self.m_stream, end_index, self.data_duration,
                picks=[0, 1, 2, 3],
                tmin=self.epoch_start_time,
                tmax=self.epoch_end_time,
                decim=self.decim)

            # Get input to classifier.
            print('Formatting data for classifier...')
            data = np.array(epochs.get_data())
            # Since the sample frequency is 220 Hz/3 = 73.33 Hz, default
            # indexes 8 and 55 is approximately 0.100 - 0.750 s.
            data = data[:, :, self.window_start_index:self.window_end_index]
            print('size of classifier-input: {}'.format(data.shape))
            print('size of identities: {}'.format(identities.shape))
            print('size of targets: {}'.format(targets.shape))

            if self.train:
                # Always store the current batch *before* deciding whether to
                # train: previously the batch that pushed the counter over
                # the threshold was counted but never added to the training
                # set, so the classifier was fit without it.
                self.train_number += data.shape[0]
                self.train_data.extend(data)
                self.train_targets.extend(targets)

                if self.train_number >= self.train_epochs:
                    print('Training LDA classifier with {} epochs'.format(
                        self.train_number))
                    i, t = lda.create_input_target(
                        zip(self.train_targets, self.train_data))
                    classifier = lda.lda_train(i, t)
                    print("Finished training.")
                    lda.save(self.path, classifier)
                    # NOTE(review): only the counter is reset; the stored
                    # training data is kept, so a later training round
                    # also includes these epochs -- confirm this is intended.
                    self.train_number = 0
            else:
                # Prediction mode: classify the freshly acquired epochs.
                classifier = lda.load(self.path)
                i, _ = lda.create_input_target(zip(targets, data))
                prediction = lda.predict(i, classifier)

                # To account for the fact that every marker is associated
                # with 4 channels, average the output of each channel
                # (channel-specific weights could be implemented in future).
                # The averaged prediction for a single event is appended to
                # self.predictions after every 4th item.
                intermediate = 0
                for index, item in enumerate(prediction):
                    intermediate += item / 4
                    if (index + 1) % 4 == 0:
                        self.predictions.append(intermediate)
                        intermediate = 0

        time.sleep(sleep_time)