Example #1
0
    def load_mini_batch(self):
        """Fetch a random batch of Wikipedia articles and return it as a corpus.

        Downloads ``self.batch_size`` articles, saves them under
        ``self.data_wiki_folder``, folds any newly seen terms into the
        vocabulary file, and builds a mini-batch ``Corpus`` in
        ``self.output_format`` (TERM_FREQUENCY or TERM_SEQUENCE).

        Side effects: writes article files, rewrites the vocabulary to
        ``<data_wiki_folder>/current_vocab.txt`` (and repoints
        ``self.vocab_file`` at it), and increments ``self.mini_batch_no``.
        """
        (docset, articlenames) = get_random_wikipedia_articles(self.batch_size)
        path_articles = save_articles_per_batch(docset, articlenames,
                                                self.data_wiki_folder)
        save_articles(docset, articlenames, self.data_wiki_folder)
        # Tokenize the batch; drop terms in fewer than 2 docs or in more
        # than 50% of docs (thresholds passed to PreProcessing below).
        raw_data = PreProcessing(path_articles,
                                 remove_rare_word=2,
                                 remove_common_word=0.5)
        raw_data.process()
        # Read the current vocabulary. Use a context manager so the handle
        # is closed even on error (the previous version leaked it on an
        # exception), and skip blank lines instead of truncating at the
        # first one.
        with open(self.vocab_file) as f_vocab:
            old_vocab = [term for term in (line.strip() for line in f_vocab)
                         if term]
        # Append terms from this batch that the vocabulary lacks; existing
        # terms keep their positions so previously assigned ids stay valid.
        in_new_but_not_in_old = set(raw_data.vocab) - set(old_vocab)
        result_vocab = old_vocab + list(in_new_but_not_in_old)
        self.vocab_file = self.data_wiki_folder + '/current_vocab.txt'
        with open(self.vocab_file, 'w') as f_new_vocab:
            for term in result_vocab:
                f_new_vocab.write(term + '\n')
        # Re-map each document's batch-local term indices to ids in the
        # merged vocabulary and keep only non-empty documents.
        dict_vocab = read_vocab(self.vocab_file)
        corpus = Corpus(DataFormat.TERM_SEQUENCE)
        for doc in raw_data.list_doc:
            mapped = [dict_vocab[raw_data.vocab[token]] for token in doc]
            if mapped:
                corpus.append_doc(mapped, len(mapped))

        logging.info("Mini batch no: %s", self.mini_batch_no)
        if self.output_format == DataFormat.TERM_FREQUENCY:
            mini_batch = utilizies.convert_corpus_format(
                corpus, DataFormat.TERM_FREQUENCY)
        else:
            mini_batch = corpus
        self.mini_batch_no += 1
        return mini_batch
Example #2
0
 def infer_new_docs(self, new_corpus):
     """Infer per-document topic proportions for unseen documents.

     Converts *new_corpus* to TERM_FREQUENCY format, runs the e-step on
     it, and returns the topic-proportion matrix ``theta``.
     """
     docs = convert_corpus_format(new_corpus, DataFormat.TERM_FREQUENCY)
     # e_step also returns an index structure that is not needed here,
     # so discard it explicitly instead of binding an unused name.
     theta, _ = self.e_step(docs.word_ids_tks, docs.cts_lens)
     return theta
Example #3
0
 def infer_new_docs(self, new_corpus):
     """Infer per-document topic proportions for unseen documents.

     Converts *new_corpus* to TERM_SEQUENCE format, runs the e-step on
     it, and returns the topic-proportion matrix ``theta``.
     """
     docs = convert_corpus_format(new_corpus, DataFormat.TERM_SEQUENCE)
     # e_step also returns sufficient statistics (N_phi, N_Z); only
     # theta is needed for held-out inference, so discard the rest.
     _, _, theta = self.e_step(docs.word_ids_tks, docs.cts_lens)
     return theta
Example #4
0
 def infer_new_docs(self, new_corpus):
     """Infer per-document topic proportions for unseen documents.

     Converts *new_corpus* to TERM_SEQUENCE format, draws topic
     assignments via ``sample_z``, and returns the topic-proportion
     matrix ``theta``.
     """
     docs = convert_corpus_format(new_corpus, DataFormat.TERM_SEQUENCE)
     # sample_z also returns the per-token assignments z, which are not
     # needed here, so discard them explicitly.
     theta, _ = self.sample_z(docs.word_ids_tks, docs.cts_lens)
     return theta
Example #5
0
 def infer_new_docs(self, new_corpus):
     """Infer per-document topic proportions for unseen documents.

     Runs the e-step on the TERM_FREQUENCY view of *new_corpus* and
     normalizes each row of the resulting gamma matrix so it sums to
     one, yielding the topic-proportion matrix ``theta``.
     """
     docs = convert_corpus_format(new_corpus, DataFormat.TERM_FREQUENCY)
     gamma, _sstats = self.e_step(docs.word_ids_tks, docs.cts_lens)
     # Row-normalize gamma; keepdims keeps the sum broadcastable.
     theta = gamma / gamma.sum(axis=1, keepdims=True)
     return theta