Esempio n. 1
0
def create_online_lda(docs, ids, name, numTopics):
  """Train an LDA model on *docs*, save it, and return its topic data.

  The documents are converted with ``docs2corpus(docs, name, True)``, which
  yields a ``(corpus, dictionary)`` pair (the meaning of the trailing
  ``True`` flag is defined by that helper -- TODO confirm). A gensim
  ``LdaModel`` with ``numTopics`` topics is trained for 10 passes and
  written to ``name + '.lda'``.

  Args:
    docs: Documents accepted by ``docs2corpus``.
    ids: Document identifiers, forwarded unchanged to ``lda2topicMap``.
    name: Base name; used by ``docs2corpus``/``lda2topicMap`` and as the
      stem of the saved model file.
    numTopics: Number of LDA topics to fit.

  Returns:
    A 2-tuple ``(topic_map, topics)`` where ``topic_map`` is the result of
    ``lda2topicMap(lda, corpus, ids, name)`` and ``topics`` is
    ``lda.show_topics(formatted=False)``.
  """
  corpus, dictionary = docs2corpus(docs, name, True)
  # Call-form print works on both Python 2 (single argument) and Python 3.
  print('>> generating online lda model...')
  lda = gensim.models.ldamodel.LdaModel(
      corpus, num_topics=numTopics, id2word=dictionary, passes=10)
  print(lda)
  lda.save(name + '.lda')
  return lda2topicMap(lda, corpus, ids, name), lda.show_topics(formatted=False)
Esempio n. 2
0
def content_model(data, num_topics=20, alpha=1, dic_path="dic.m",
                  corpus_path="corpus.m", model_path="ldaModel.model"):
    """Build a dictionary, a bag-of-words corpus and an LDA model from *data*.

    The dictionary, the serialized corpus and the trained model are each
    written to disk. The LDA model is trained on the TF-IDF-weighted corpus,
    not on the raw counts.

    Args:
        data: Tokenized documents (an iterable of token sequences). It is
            iterated twice, so it must be re-iterable (e.g. a list, not a
            generator).
        num_topics: Number of LDA topics to fit.
        alpha: ``alpha`` argument forwarded to ``models.LdaModel``.
        dic_path: Where the dictionary is saved.
        corpus_path: Where the bag-of-words corpus is serialized
            (Matrix Market format via ``corpora.MmCorpus``).
        model_path: Where the trained LDA model is saved.

    Returns:
        The trained LDA model (also persisted at ``model_path``).
    """
    dic = corpora.Dictionary(data)
    dic.save(dic_path)

    corpus = [dic.doc2bow(text) for text in data]
    corpora.MmCorpus.serialize(corpus_path, corpus)

    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    lda = models.LdaModel(corpus_tfidf, id2word=dic,
                          num_topics=num_topics, alpha=alpha)
    lda.save(model_path)
    return lda
Esempio n. 3
0
# integer word id and returns the result as a sparse vector.
corpus = [dictionary.doc2bow(text) for text in texts]
# corpora.MmCorpus.serialize('raop.mm', corpus)
'''
Training the corpus using various models  
ONLY FIT THE MODEL ONCE BECAUSE TOPICS CHANGE AFTER EVERY RUN!
Load the saved model file to apply the model to new documents
'''

# Latent Dirichlet Allocation (LDA): 10 topics, 3 training passes.
lda = models.ldamodel.LdaModel(
    corpus, id2word=dictionary, num_topics=10, passes=3)
corpus_lda = lda[corpus]
lda.save('model.lda')
'''
LOAD THE MODELS BEFORE TRYING TO RUN SIMILARITY QUERIES!
'''

lda = models.LdaModel.load('model.lda')

# Record, for every request text, the id of its highest-weighted LDA topic.
docs = request_text_list
lda_topics = []
for doc in docs:
    # Lower-case, split on whitespace, then map the tokens to bag-of-words.
    vec_bow = dictionary.doc2bow(doc.lower().split())
    # (topic_id, weight) pairs ordered from highest to lowest weight; the
    # sort is stable, so ties keep the order the model returned them in.
    vec_lda = sorted(lda[vec_bow], key=lambda item: item[1], reverse=True)
    lda_topics.append(vec_lda[0][0])
Esempio n. 4
0
# Persist the TF-IDF model. NOTE(review): `tfidf`, `corpus_tfidf`, `corpus`
# and `dictionary` are not defined in this excerpt -- presumably created
# earlier in the script; confirm.
tfidf.save('model.tfidf')

# Latent Semantic Indexing, LSI (or sometimes LSA)
# Built from the TF-IDF corpus with 10 topics, applied to that same corpus,
# then saved.
lsi = models.lsimodel.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=10)
corpus_lsi = lsi[corpus_tfidf]
lsi.save('model.lsi')

# Random Projections, RP
# Also built from the TF-IDF corpus with 10 topics (no id2word is passed).
rp = models.rpmodel.RpModel(corpus_tfidf, num_topics=10)
corpus_rp = rp[corpus_tfidf]
rp.save('model.rp')
    
# Latent Dirichlet Allocation, LDA
# Built from `corpus` (not `corpus_tfidf`), 10 topics, 5 passes.
lda = models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=10, passes=5)
corpus_lda = lda[corpus]
lda.save('model.lda')
    
# Hierarchical Dirichlet Process, HDP
# Built from `corpus`; no topic count is passed.
hdp = models.hdpmodel.HdpModel(corpus, id2word=dictionary)
corpus_hdp = hdp[corpus]
hdp.save('model.hdp')

'''
LOAD THE MODELS BEFORE TRYING TO RUN SIMILARITY QUERIES!
'''

# Rebind each name to the model re-read from the file written above.
tfidf = models.TfidfModel.load('model.tfidf')
lsi = models.LsiModel.load('model.lsi')
rp = models.RpModel.load('model.rp')
lda = models.LdaModel.load('model.lda')
hdp = models.HdpModel.load('model.hdp')
Esempio n. 5
0
    def _loop_worker(self):
        """The main loop for performing real time analysis.

        Takes items from the marker analysis queue sequentially, forms mne epochs, and either uses the data for
        real time training or to predict the letter that was mind-typed.
        Structure is adapted from rteeg.rteeg.analysis._loop_worker.

        The loop runs until ``self._kill_signal`` is set, polling every 0.01 s.

        * Training mode (``self.train`` is truthy): every batch of epochs is appended to ``self.train_data`` /
          ``self.train_targets``. Once ``self.train_number`` reaches ``self.train_epochs`` a classifier is
          trained on everything accumulated, saved to ``self.path``, and ``self.train_number`` is reset to 0.
        * Prediction mode: the classifier is loaded from ``self.path`` and, for every complete group of 4
          consecutive classifier outputs, their mean is appended to ``self.predictions``.
        """
        sleep_time = 0.01  # Time to sleep between queries.

        while not self._kill_signal.is_set():
            # when items exist in the marker analysis queue
            if not self.m_stream.analyze.empty():
                print('Began analyzing data...')

                # Find the EEG row whose last column is closest to the marker value `ts`
                # (presumably both are timestamps -- confirm against the stream classes).
                ts = self.m_stream.remove_analysis()
                tmp = np.array(self.eeg_stream.data)
                nearest = np.abs(tmp[:, -1] - ts).argmin()
                # `1 / (1 / sfreq)` is one second worth of samples: division binds tighter than
                # the addition, so the trial ends one second after the nearest sample.
                end_index = int(nearest + 1 / (1 / self.eeg_stream.info['sfreq']))

                # ensure there is enough eeg data before analyzing; wait if there isn't
                while len(self.eeg_stream.data) < end_index:
                    # Stop waiting when asked to shut down; otherwise a stalled EEG stream
                    # would keep this worker alive forever.
                    if self._kill_signal.is_set():
                        return
                    time.sleep(sleep_time)

                # Make an MNE epoch from channels 0-3 (EEG), decim = keep every nth sample
                epochs, identities, targets = self.eeg_stream.make_epochs(
                    self.m_stream,
                    end_index,
                    self.data_duration,
                    picks=[0, 1, 2, 3],
                    tmin=self.epoch_start_time,
                    tmax=self.epoch_end_time,
                    decim=self.decim)

                # get input to classifier
                print('Formatting data for classifier...')
                data = np.array(epochs.get_data())
                # since the sample frequency is 220 Hz/3 = 73.33 Hz, default indexes 8 and 55 is approximately 0.100 - 0.750 s
                data = data[:, :, self.window_start_index:self.window_end_index]
                print('size of classifier-input: {}'.format(data.shape))
                print('size of identities: {}'.format(identities.shape))
                print('size of targets: {}'.format(targets.shape))

                # If training classifier, send data to classifier with ground truth targets
                if self.train:
                    # Store the batch BEFORE checking the threshold: previously the batch that
                    # reached `train_epochs` was counted and reported but never trained on.
                    self.train_number += data.shape[0]
                    self.train_data.extend(data)
                    self.train_targets.extend(targets)
                    if self.train_number >= self.train_epochs:
                        print('Training LDA classifier with {} epochs'.format(
                            self.train_number))
                        i, t = lda.create_input_target(
                            zip(self.train_targets, self.train_data))
                        classifier = lda.lda_train(i, t)
                        print("Finished training.")
                        lda.save(self.path, classifier)
                        # NOTE(review): only the counter is reset; the accumulated data is kept,
                        # so a later round retrains on old + new epochs -- confirm this is intended.
                        self.train_number = 0
                # else do a prediction
                else:
                    classifier = lda.load(self.path)
                    i, t = lda.create_input_target(zip(targets, data))
                    prediction = lda.predict(i, classifier)
                    # To account for the fact that every marker is associated with 4 channels, average the output
                    # of each channel or apply specific weights to each channel (possibly implement in future).
                    # Predictions for a single event based on 4 channels is appended to a list.
                    # A trailing group of fewer than 4 outputs is discarded.
                    channel_mean = 0
                    for index, item in enumerate(prediction):
                        channel_mean += item / 4
                        if (index + 1) % 4 == 0:
                            self.predictions.append(channel_mean)
                            channel_mean = 0
            time.sleep(sleep_time)