def load_corpus(self, corpus_dir):
    """Load corpus from a given directory, then initialize the documents
    and model.

    Line format:
        token1 \t token2 \t token3 \t ...
        ...
    """
    self.documents = []
    rand = random.Random()

    logging.info('Load corpus from %s.' % corpus_dir)
    for root, dirs, files in os.walk(corpus_dir):
        for f in files:
            filename = os.path.join(root, f)
            logging.info('Load filename %s.' % filename)
            with open(filename, 'r') as fp:
                for doc_str in fp:
                    doc_str = doc_str.decode('gbk')
                    doc_tokens = doc_str.strip().split('\t')
                    # Skip documents with fewer than two tokens.
                    if len(doc_tokens) < 2:
                        continue
                    document = Document(self.model.num_topics)
                    document.parse_from_tokens(
                        doc_tokens, rand, self.vocabulary)
                    # Skip documents with fewer than two in-vocabulary words.
                    if document.num_words() < 2:
                        continue
                    self.documents.append(document)
    logging.info('The document number is %d.' % len(self.documents))

    self._initialize_model()
    self._compute_smoothing_only_bucket()
    self._initialize_topic_word_coefficient()
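
# Hedged usage sketch (not part of the original file): load_corpus expects
# one document per line, tokens joined by '\t', stored in GBK-encoded files,
# matching the doc_str.decode('gbk') call above. build_toy_corpus writes such
# a file; the trainer construction is left as a comment because the owning
# class is not shown in this section.
import codecs


def build_toy_corpus(corpus_dir):
    """Write a two-document GBK corpus in the format load_corpus expects."""
    if not os.path.exists(corpus_dir):
        os.makedirs(corpus_dir)
    fp = codecs.open(os.path.join(corpus_dir, 'part-00000'), 'w',
                     encoding='gbk')
    fp.write(u'macbook\tipad\tmac os x\n')  # document 1
    fp.write(u'chrome\tandroid\tnexus\n')   # document 2
    fp.close()

# build_toy_corpus('/tmp/toy_corpus')
# trainer.load_corpus('/tmp/toy_corpus')  # 'trainer' is whatever object owns load_corpus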
def _inference_one_chain(self, doc_tokens, rand):
    """Infer topics with one Markov chain.

    Returns the sparse topic distribution p(z|d).
    """
    document = Document(self.model.num_topics)
    document.parse_from_tokens(doc_tokens, rand, self.vocabulary,
                               self.model)
    if document.num_words() == 0:
        return dict()

    accumulated_topic_hist = {}
    for i in xrange(self.total_iterations):
        # One Gibbs sampling iteration: resample the topic of every word.
        for word in document.words:
            # Remove the word's current topic assignment ...
            document.decrease_topic(word.topic, 1)
            new_topic = self._sample_word_topic(document, word.id, rand)
            assert new_topic is not None
            word.topic = new_topic
            # ... and add back the newly sampled one.
            document.increase_topic(new_topic, 1)
        # After burn-in, accumulate the per-topic counts of this sample.
        if i >= self.burn_in_iterations:
            for non_zero in document.doc_topic_hist.non_zeros:
                if non_zero.topic in accumulated_topic_hist:
                    accumulated_topic_hist[non_zero.topic] += non_zero.count
                else:
                    accumulated_topic_hist[non_zero.topic] = non_zero.count

    topic_dist = self._l1normalize_distribution(accumulated_topic_hist)
    return topic_dist
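
# The _l1normalize_distribution helper is referenced above but not defined in
# this section. A minimal sketch consistent with how it is used (mapping
# accumulated counts to a distribution summing to 1.0); the real
# implementation may differ.
def _l1normalize_distribution(self, topic_hist):
    """Sketch: L1-normalize a dict of topic id -> count into p(z|d)."""
    topic_dist = {}
    total = sum(topic_hist.itervalues())
    if total == 0:
        return topic_dist
    for topic, count in topic_hist.iteritems():
        topic_dist[topic] = float(count) / total
    return topic_dist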
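
# _inference_one_chain runs a single chain; callers typically average several
# independent chains to reduce sampling noise. A hedged sketch of such a
# wrapper follows: the method name 'inference' and the attribute
# 'total_num_markov_chains' are assumptions, not part of this section.
def inference(self, doc_tokens, rand):
    """Sketch: average p(z|d) over several independent Markov chains."""
    accumulated_dist = {}
    for _ in xrange(self.total_num_markov_chains):
        topic_dist = self._inference_one_chain(doc_tokens, rand)
        for topic, prob in topic_dist.iteritems():
            accumulated_dist[topic] = accumulated_dist.get(topic, 0.0) + prob
    # Re-normalizing the summed distributions yields their average.
    return self._l1normalize_distribution(accumulated_dist)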
def test_compute_loglikelihood(self):
    doc_tokens = ['macbook', 'ipad',  # exist in both vocabulary and model
                  'mac os x', 'chrome',  # exist only in vocabulary
                  'nokia', 'null']  # exist in neither
    document = Document(self.model.num_topics)
    rand = random.Random()
    rand.seed(0)
    document.parse_from_tokens(
        doc_tokens, rand, self.vocabulary, self.model)
    documents = [document, document]
    self.assertEqual(-14.113955684239654,
                     self.model_evaluator.compute_loglikelihood(documents))
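
# The expected value above comes from the standard LDA document likelihood:
# log p(d) = sum over modeled words w of log sum_z p(w|z) * p(z|d). A sketch
# of that computation follows; the function name and the dict shapes of its
# arguments are assumptions, and the real ModelEvaluator may differ in detail
# (e.g. smoothing priors).
import math


def doc_loglikelihood(word_topic_dists, doc_topic_dist, word_ids):
    """Sketch: log p(d) for one document.

    word_topic_dists: dict word_id -> {topic: p(w|z)}  (assumed shape)
    doc_topic_dist:   dict topic -> p(z|d)             (assumed shape)
    word_ids:         ids of the document's modeled words
    """
    loglikelihood = 0.0
    for word_id in word_ids:
        # Mixture probability of this word under the document's topics.
        word_prob = 0.0
        for topic, prob in doc_topic_dist.iteritems():
            word_prob += word_topic_dists.get(word_id, {}).get(topic, 0.0) * prob
        if word_prob > 0.0:
            loglikelihood += math.log(word_prob)
    return loglikelihood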